From f1d390bdde7e8bb9bee36596bcfa259bf61de22e Mon Sep 17 00:00:00 2001 From: Holger Kohr Date: Fri, 2 Nov 2018 20:07:50 +0100 Subject: [PATCH 01/10] Simplify mnist Gluon tutorial and add mislabelled sample plotting --- docs/tutorials/gluon/mnist.md | 138 ++++++++++++++++++++++------------ 1 file changed, 91 insertions(+), 47 deletions(-) diff --git a/docs/tutorials/gluon/mnist.md b/docs/tutorials/gluon/mnist.md index 5b8a98a3d668..d7e11ff82f4a 100644 --- a/docs/tutorials/gluon/mnist.md +++ b/docs/tutorials/gluon/mnist.md @@ -14,10 +14,10 @@ imperative fashion. This is based on the Mnist tutorial with symbolic approach. You can find it [here](http://mxnet.io/tutorials/python/mnist.html). ## Prerequisites + To complete this tutorial, we need: - MXNet. See the instructions for your operating system in [Setup and Installation](http://mxnet.io/install/index.html). - - [Python Requests](http://docs.python-requests.org/en/master/) and [Jupyter Notebook](http://jupyter.org/index.html). ``` @@ -100,8 +100,7 @@ We will use [Trainer](http://mxnet.io/api/python/gluon/gluon.html#trainer) class initialized parameters. ```python -gpus = mx.test_utils.list_gpus() -ctx = [mx.gpu()] if gpus else [mx.cpu(0), mx.cpu(1)] +ctx = mx.gpu(0) if mx.context.num_gpus() > 0 else mx.cpu(0) net.initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx) trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.02}) ``` @@ -126,41 +125,39 @@ training scope which is defined by `autograd.record()`. ```python %%time -epoch = 10 -# Use Accuracy as the evaluation metric. + +num_epochs = 10 metric = mx.metric.Accuracy() softmax_cross_entropy_loss = gluon.loss.SoftmaxCrossEntropyLoss() -for i in range(epoch): - # Reset the train data iterator. +for epoch in range(num_epochs): + # Restart the training data iterator at the beginning of each epoch train_data.reset() - # Loop over the train data iterator. + for batch in train_data: - # Splits train data into multiple slices along batch_axis - # and copy each slice into a context. - data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0) - # Splits train labels into multiple slices along batch_axis - # and copy each slice into a context. - label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0) - outputs = [] - # Inside training scope + # Possibly copy data and labels to the GPU + data = batch.data[0].copyto(ctx) + labels = batch.label[0].copyto(ctx) + + # The forward pass and the loss computation need to be wrapped + # in an `ag.record()` scope to indicate that the results will + # be needed in the backward pass (gradient computation). with ag.record(): - for x, y in zip(data, label): - z = net(x) - # Computes softmax cross entropy loss. - loss = softmax_cross_entropy_loss(z, y) - # Backpropagate the error for one iteration. - loss.backward() - outputs.append(z) - # Updates internal evaluation - metric.update(label, outputs) - # Make one step of parameter update. Trainer needs to know the - # batch size of data to normalize the gradient by 1/batch_size. - trainer.step(batch.data[0].shape[0]) - # Gets the evaluation result. + out = net(data) + loss = softmax_cross_entropy_loss(out, labels) + + # Compute gradients by backpropagation and update the evaluation + # metric + loss.backward() + metric.update(labels, out) + + # Update the parameters by stepping the trainer; the batch size + # is required to normalize the gradients by `1 / batch_size`. 
+        trainer.step(batch_size=batch.data[0].shape[0])
+
+    # Print the evaluation metric and reset it for the next epoch
     name, acc = metric.get()
-    # Reset evaluation result to initial state.
+    print('After epoch {}: {} = {}'.format(epoch + 1, name, acc))
     metric.reset()
-    print('training acc at epoch %d: %s=%f'%(i, name, acc))
 ```

#### Prediction

After the above training completes, we can evaluate the trained model by running predictions on the validation dataset. Since the dataset also has labels for all test images, we can compute the accuracy metric over the validation data as follows:

```python
-# Use Accuracy as the evaluation metric.
 metric = mx.metric.Accuracy()
-# Reset the validation data iterator.
 val_data.reset()
-# Loop over the validation data iterator.
 for batch in val_data:
-    # Splits validation data into multiple slices along batch_axis
-    # and copy each slice into a context.
-    data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0)
-    # Splits validation label into multiple slices along batch_axis
-    # and copy each slice into a context.
-    label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0)
-    outputs = []
-    for x in data:
-        outputs.append(net(x))
-    # Updates internal evaluation
-    metric.update(label, outputs)
-print('validation acc: %s=%f'%metric.get())
+    # Possibly copy data and labels to the GPU
+    data = batch.data[0].copyto(ctx)
+    labels = batch.label[0].copyto(ctx)
+    metric.update(labels, net(data))
+print('Validation: {} = {}'.format(*metric.get()))
 assert metric.get()[1] > 0.94
 ```

If everything went well, we should see an accuracy value that is around 0.96, which means that we are able to accurately predict the digit in 96% of test images. This is a pretty good result. But as we will see in the next part of this tutorial, we can do a lot better than that.

That said, a single number only gives us very limited information on the performance of our neural network. It is always a good idea to actually look at the images on which the network performed poorly, and check for clues on how to improve the performance.
We do that with the help of a small function that produces a list of the images where the network got it wrong, together with the predicted and true labels:

```python
def get_mislabelled(it):
    """Return list of ``(input, pred_lbl, true_lbl)`` for mislabelled samples."""
    mislabelled = []
    it.reset()
    for batch in it:
        data = batch.data[0].copyto(ctx)
        labels = batch.label[0].copyto(ctx)
        out = net(data)
        # The predicted label is the index where the output is maximal
        preds = mx.nd.argmax(out, axis=1)
        for d, p, l in zip(data, preds, labels):
            if p != l:
                mislabelled.append(
                    (d.asnumpy(), int(p.asnumpy()), int(l.asnumpy()))
                )
    return mislabelled
```

We can now get the mislabelled images in the training and validation sets and plot a selection of them:

```python
import numpy as np
import matplotlib.pyplot as plt

sample_size = 8
wrong_train = get_mislabelled(train_data)
wrong_val = get_mislabelled(val_data)
wrong_train_sample = [wrong_train[i] for i in np.random.randint(0, len(wrong_train), size=sample_size)]
wrong_val_sample = [wrong_val[i] for i in np.random.randint(0, len(wrong_val), size=sample_size)]

# Figure-level properties only need to be set once, outside the loop.
fig, axs = plt.subplots(ncols=sample_size)
fig.set_size_inches(18, 4)
fig.suptitle("Sample of wrong predictions in the training set", fontsize=20)
for ax, (img, pred, lbl) in zip(axs, wrong_train_sample):
    ax.imshow(img[0], cmap="gray")
    ax.set_title("Predicted: {}\nActual: {}".format(pred, lbl))
    ax.xaxis.set_visible(False)
    ax.yaxis.set_visible(False)

fig, axs = plt.subplots(ncols=sample_size)
fig.set_size_inches(18, 4)
fig.suptitle("Sample of wrong predictions in the validation set", fontsize=20)
for ax, (img, pred, lbl) in zip(axs, wrong_val_sample):
    ax.imshow(img[0], cmap="gray")
    ax.set_title("Predicted: {}\nActual: {}".format(pred, lbl))
    ax.xaxis.set_visible(False)
    ax.yaxis.set_visible(False)
```
![png](./wrong_train.png)
![png](./wrong_val.png)

In this case, it is rather obvious that our MLP network is too simple to perform really well on this dataset: some of the mislabelled examples are rather "simple" and should not pose a challenge to our neural net. As it turns out, moving to the CNN architecture presented in the following section will give a big performance boost.

### Convolutional Neural Network

Earlier, we briefly touched on a drawback of the MLP when we said that we need to discard the input image's original shape and flatten it into a vector before we can feed it to the MLP's first fully connected layer. It turns out this is an important issue, because we don't take advantage of the fact that pixels in the image have natural spatial correlation along the horizontal and vertical axes. A convolutional neural network (CNN) aims to address this problem by using a more structured weight representation. Instead of flattening the image and doing a simple matrix-matrix multiplication, it employs one or more convolutional layers that each perform a 2-D convolution on the input image.

@@ -265,7 +309,7 @@ trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.03})
 metric = mx.metric.Accuracy()
 softmax_cross_entropy_loss = gluon.loss.SoftmaxCrossEntropyLoss()
 
-for i in range(epoch):
+for i in range(num_epochs):
     # Reset the train data iterator.
     train_data.reset()
     # Loop over the train data iterator.
From 5df5071e12480db031cb87322b778dacf8f94ed2 Mon Sep 17 00:00:00 2001
From: Holger
Date: Fri, 2 Nov 2018 20:11:34 +0100
Subject: [PATCH 02/10] Add mnist Gluon tutorial images

---
 docs/tutorials/gluon/wrong_train.png | Bin 0 -> 15741 bytes
 docs/tutorials/gluon/wrong_val.png   | Bin 0 -> 15438 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 docs/tutorials/gluon/wrong_train.png
 create mode 100644 docs/tutorials/gluon/wrong_val.png

diff --git a/docs/tutorials/gluon/wrong_train.png b/docs/tutorials/gluon/wrong_train.png
new file mode 100644
index 0000000000000000000000000000000000000000..2ddf291e944490ff1e375c615057ab87e96c6e43
GIT binary patch
(15741 bytes of binary image data omitted)

diff --git a/docs/tutorials/gluon/wrong_val.png b/docs/tutorials/gluon/wrong_val.png
new file mode 100644
index 0000000000000000000000000000000000000000..8feed9034921df1d2bdcd8908543b35ad6568521
GIT binary patch
(15438 bytes of binary image data omitted)

From 12914c5053270933b04a2fb6b6f6e24307b71e1e Mon Sep 17 00:00:00 2001
From: Holger Kohr
Date: Tue, 6 Nov 2018 01:08:27 +0100
Subject: [PATCH 03/10] Gluon MNIST tutorial: Use modern Gluon constructs, fix
 some wordings

---
 docs/tutorials/gluon/mnist.md | 52 +++++++++++++++++++----------------
 1 file changed, 28 insertions(+), 24 deletions(-)

diff --git a/docs/tutorials/gluon/mnist.md b/docs/tutorials/gluon/mnist.md
index d7e11ff82f4a..1f438b0f3712 100644
--- a/docs/tutorials/gluon/mnist.md
+++ b/docs/tutorials/gluon/mnist.md
@@ -1,21 +1,18 @@
 # Handwritten Digit Recognition
 
-In this tutorial, we'll give you a step by step walk-through of how to build a hand-written digit classifier using the [MNIST](https://en.wikipedia.org/wiki/MNIST_database) dataset.
+In this tutorial, we'll give you a step-by-step walkthrough of how to build a hand-written digit classifier using the [MNIST](https://en.wikipedia.org/wiki/MNIST_database) dataset.
 
-MNIST is a widely used dataset for the hand-written digit classification task. It consists of 70,000 labeled 28x28 pixel grayscale images of hand-written digits. The dataset is split into 60,000 training images and 10,000 test images. There are 10 classes (one for each of the 10 digits). The task at hand is to train a model using the 60,000 training images and subsequently test its classification accuracy on the 10,000 test images.
+MNIST is a widely used dataset for the hand-written digit classification task. It consists of 70,000 labeled 28x28 pixel grayscale images of hand-written digits. The dataset is split into 60,000 training images and 10,000 test images. There are 10 classes (one for each of the 10 digits). The task at hand is to train a model that can correctly classify the images into the digits they represent. The 60,000 training images are used to fit the model, and its performance in terms of classification accuracy is subsequently validated on the 10,000 test images.
 
 ![png](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/example/mnist.png)
 
 **Figure 1:** Sample images from the MNIST dataset.
 
-This tutorial uses MXNet's new high-level interface, gluon package to implement MLP using
-imperative fashion.
-
-This is based on the Mnist tutorial with symbolic approach. You can find it [here](http://mxnet.io/tutorials/python/mnist.html).
+This tutorial uses MXNet's high-level *Gluon* interface to implement neural networks in an imperative fashion. It is based on [the corresponding tutorial written with the symbolic approach](http://mxnet.io/tutorials/python/mnist.html).
 
 ## Prerequisites
 
-To complete this tutorial, we need:
+To complete this tutorial, you need:
 
 - MXNet. See the instructions for your operating system in [Setup and Installation](http://mxnet.io/install/index.html).
 - [Python Requests](http://docs.python-requests.org/en/master/) and [Jupyter Notebook](http://jupyter.org/index.html).
@@ -28,28 +25,39 @@ $ pip install requests jupyter Before we define the model, let's first fetch the [MNIST](http://yann.lecun.com/exdb/mnist/) dataset. -The following source code downloads and loads the images and the corresponding labels into memory. +The following source code downloads the images and creates dataset objects `train_data` and `val_data` (for training and validation, respectively) that can be used to get one or several images at a time, together with their labels. We also add a `transform` function that rescales the images from `[0, 255]` to `[0, 1]`. ```python import mxnet as mx -# Fixing the random seed +# Select a fixed random seed for reproducibility mx.random.seed(42) -mnist = mx.test_utils.get_mnist() +train_data = mx.gluon.data.vision.MNIST( + train=True, + transform=lambda data, label: (data.astype("float32") / 255, label), +) +val_data = mx.gluon.data.vision.MNIST( + train=False, + transform=lambda data, label: (data.astype("float32") / 255, label), +) ``` -After running the above source code, the entire MNIST dataset should be fully loaded into memory. Note that for large datasets it is not feasible to pre-load the entire dataset first like we did here. What is needed is a mechanism by which we can quickly and efficiently stream data directly from the source. MXNet Data iterators come to the rescue here by providing exactly that. Data iterator is the mechanism by which we feed input data into an MXNet training algorithm and they are very simple to initialize and use and are optimized for speed. During training, we typically process training samples in small batches and over the entire training lifetime will end up processing each training example multiple times. In this tutorial, we'll configure the data iterator to feed examples in batches of 100. Keep in mind that each example is a 28x28 grayscale image and the corresponding label. +Since the MNIST dataset is relatively small, this class loads it into memory at once, but for larger datasets like ImageNet, this would no longer be possible. The Gluon `Dataset` class from which `MNIST` derives supports both cases. +In general, `Dataset` and `DataLoader` (which we'll see in a second) are the machinery in MXNet which provides a stream of input data that is consumed by a training algorithm, typically in batches of multiple data entities at once for better efficiency. +In this tutorial, we'll configure the data loader to feed examples in batches of 100. + +Image batches are commonly represented by a 4-D array with shape `(batch_size, num_channels, height, width)`. This convention is denoted by "BCHW", and it is the default in MXNet. For the MNIST dataset, each image has a size of 28x28 pixels and one color channel (grayscale), hence the shape of an input batch will be `(batch_size, 1, 28, 28)`. -Image batches are commonly represented by a 4-D array with shape `(batch_size, num_channels, width, height)`. For the MNIST dataset, since the images are grayscale, there is only one color channel. Also, the images are 28x28 pixels, and so each image has width and height equal to 28. Therefore, the shape of input is `(batch_size, 1, 28, 28)`. Another important consideration is the order of input samples. When feeding training examples, it is critical that we don't feed samples with the same label in succession. Doing so can slow down training. +Another important consideration is the order of input samples. When feeding training examples, it is critical that we don't feed samples with the same label in succession. 
Doing so can slow down training. Data iterators take care of this by randomly shuffling the inputs. Note that we only need to shuffle the training data. The order does not matter for test data. -The following source code initializes the data iterators for the MNIST dataset. Note that we initialize two iterators: one for train data and one for test data. +The following code initializes the data iterators for the MNIST dataset. ```python batch_size = 100 -train_data = mx.io.NDArrayIter(mnist['train_data'], mnist['train_label'], batch_size, shuffle=True) -val_data = mx.io.NDArrayIter(mnist['test_data'], mnist['test_label'], batch_size) +train_loader = mx.gluon.data.DataLoader(train_data, shuffle=True, batch_size=batch_size) +val_loader = mx.gluon.data.DataLoader(val_data, shuffle=False, batch_size=batch_size) ``` ## Approaches @@ -61,26 +69,22 @@ Now, let's import required nn modules ```python from __future__ import print_function import mxnet as mx -from mxnet import gluon +from mxnet import gluon, autograd from mxnet.gluon import nn -from mxnet import autograd as ag ``` ### Define a network: Multilayer Perceptron The first approach makes use of a [Multilayer Perceptron](https://en.wikipedia.org/wiki/Multilayer_perceptron) to solve this problem. We'll define the MLP using MXNet's imperative approach. -MLPs consist of several fully connected layers. A fully connected layer or FC layer for short, is one where each neuron in the layer is connected to every neuron in its preceding layer. From a linear algebra perspective, an FC layer applies an [affine transform](https://en.wikipedia.org/wiki/Affine_transformation) to the *n x m* input matrix *X* and outputs a matrix *Y* of size *n x k*, where *k* is the number of neurons in the FC layer. *k* is also referred to as the hidden size. The output *Y* is computed according to the equation *Y = W X + b*. The FC layer has two learnable parameters, the *m x k* weight matrix *W* and the *m x 1* bias vector *b*. - -In an MLP, the outputs of most FC layers are fed into an activation function, which applies an element-wise non-linearity. This step is critical and it gives neural networks the ability to classify inputs that are not linearly separable. Common choices for activation functions are sigmoid, tanh, and [rectified linear unit](https://en.wikipedia.org/wiki/Rectifier_%28neural_networks%29) (ReLU). In this example, we'll use the ReLU activation function which has several desirable properties and is typically considered a default choice. +MLPs consist of several fully connected layers. A fully connected layer or FC layer for short, is one where each neuron in the layer is connected to every neuron in its preceding layer. From a linear algebra perspective, an FC layer applies an [affine transform](https://en.wikipedia.org/wiki/Affine_transformation) to the *n x m* input matrix *X* and outputs a matrix *Y* of size *n x k*, where *k* is the number of neurons in the FC layer. *k* is also referred to as the hidden size. The output *Y* is computed according to the equation *Y = X W + b*. The FC layer has two learnable parameters, the *m x k* weight matrix *W* and the *1 x k* bias vector *b*. -The following code declares three fully connected layers with 128, 64 and 10 neurons each. -The last fully connected layer often has its hidden size equal to the number of output classes in the dataset. Furthermore, these FC layers uses ReLU activation for performing an element-wise ReLU transformation on the FC layer output. 
+In an MLP, the outputs of FC layers are typically fed into an activation function that applies an elementwise nonlinearity. This step is critical since it gives neural networks the ability to classify inputs that are not linearly separable. Common choices for activation functions are sigmoid, tanh, and [rectified linear unit](https://en.wikipedia.org/wiki/Rectifier_%28neural_networks%29) (ReLU). In this example, we'll use the ReLU activation function since it has several nice properties that make it a good default choice. -To do this, we will use [Sequential layer](http://mxnet.io/api/python/gluon/gluon.html#mxnet.gluon.nn.Sequential) type. This is simply a linear stack of neural network layers. `nn.Dense` layers are nothing but the fully connected layers we discussed above. +The following code declares three fully connected (or *dense*) layers with 128, 64 and 10 neurons each, where the last number of neurons matches the number of output classes in our dataset. +To build the neural network, We use a [`Sequential` layer](http://mxnet.io/api/python/gluon/gluon.html#mxnet.gluon.nn.Sequential), which is a convenience class to build a simple linear stack of layers (often called a *feed-forward neural net*). ```python -# define network net = nn.Sequential() with net.name_scope(): net.add(nn.Dense(128, activation='relu')) From 286ed28600c898515e8c5fc395fc97d4a1eba2a8 Mon Sep 17 00:00:00 2001 From: Holger Kohr Date: Tue, 27 Nov 2018 00:34:19 +0100 Subject: [PATCH 04/10] [Gluon] Move to data loaders and improve wording in MNIST tutorial --- docs/tutorials/gluon/mnist.md | 203 ++++++++++++++++++++-------------- 1 file changed, 120 insertions(+), 83 deletions(-) diff --git a/docs/tutorials/gluon/mnist.md b/docs/tutorials/gluon/mnist.md index 1f438b0f3712..f0f6cda6a423 100644 --- a/docs/tutorials/gluon/mnist.md +++ b/docs/tutorials/gluon/mnist.md @@ -1,21 +1,22 @@ -# Handwritten Digit Recognition +# Hand-written Digit Recognition -In this tutorial, we'll give you a step-by-step walkthrough of how to build a hand-written digit classifier using the [MNIST](https://en.wikipedia.org/wiki/MNIST_database) dataset. +In this tutorial, we'll give you a step-by-step walkthrough of building a hand-written digit classifier using the [MNIST](https://en.wikipedia.org/wiki/MNIST_database) dataset. -MNIST is a widely used dataset for the hand-written digit classification task. It consists of 70,000 labeled 28x28 pixel grayscale images of hand-written digits. The dataset is split into 60,000 training images and 10,000 test images. There are 10 classes (one for each of the 10 digits). The task at hand is to train a model that can correctly classify the images into the digits they represent. The 60,000 training images are used to fit the model, and its performance in terms of classification accuracy is subsequently validated on the 10,000 test images. +MNIST is a widely used dataset for the hand-written digit classification task. It consists of 70,000 labeled grayscale images of hand-written digits, each 28x28 pixels in size. The dataset is split into 60,000 training images and 10,000 test images. There are 10 classes (one for each of the 10 digits). The task at hand is to train a model that can correctly classify the images into the digits they represent. The 60,000 training images are used to fit the model, and its performance in terms of classification accuracy is subsequently validated on the 10,000 test images. 
![png](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/example/mnist.png) **Figure 1:** Sample images from the MNIST dataset. -This tutorial uses MXNet's high-level *Gluon* interface to implement neural networks in an imperative fashion. It is based on [the corresponding tutorial written with the symbolic approach](http://mxnet.io/tutorials/python/mnist.html). +This tutorial uses MXNet's high-level *Gluon* interface to implement neural networks in an imperative fashion. It is based on [the corresponding tutorial written with the symbolic approach](https://mxnet.io/tutorials/python/mnist.html). ## Prerequisites To complete this tutorial, you need: -- MXNet. See the instructions for your operating system in [Setup and Installation](http://mxnet.io/install/index.html). -- [Python Requests](http://docs.python-requests.org/en/master/) and [Jupyter Notebook](http://jupyter.org/index.html). +- MXNet. See the instructions for your operating system in [Setup and Installation](https://mxnet.io/install/index.html). +- The Python [`requests`](http://docs.python-requests.org/en/master/) library. +- (Optional) The [Jupyter Notebook](https://jupyter.org/index.html) software for interactively running the provided `.ipynb` file. ``` $ pip install requests jupyter @@ -23,9 +24,10 @@ $ pip install requests jupyter ## Loading Data -Before we define the model, let's first fetch the [MNIST](http://yann.lecun.com/exdb/mnist/) dataset. +The following code downloads the MNIST dataset to the default location (`.mxnet/datasets/mnist/` in your home directory) and creates `Dataset` objects `train_data` and `val_data` for training and validation, respectively. +These objects can later be used to get one image or a batch of images at a time, together with their corresponding labels. -The following source code downloads the images and creates dataset objects `train_data` and `val_data` (for training and validation, respectively) that can be used to get one or several images at a time, together with their labels. We also add a `transform` function that rescales the images from `[0, 255]` to `[0, 1]`. +We also add a `transform` function that rescales the images from `[0, 255]` to `[0, 1]`. ```python import mxnet as mx @@ -43,14 +45,19 @@ val_data = mx.gluon.data.vision.MNIST( ) ``` -Since the MNIST dataset is relatively small, this class loads it into memory at once, but for larger datasets like ImageNet, this would no longer be possible. The Gluon `Dataset` class from which `MNIST` derives supports both cases. -In general, `Dataset` and `DataLoader` (which we'll see in a second) are the machinery in MXNet which provides a stream of input data that is consumed by a training algorithm, typically in batches of multiple data entities at once for better efficiency. -In this tutorial, we'll configure the data loader to feed examples in batches of 100. +Since the MNIST dataset is relatively small, this class loads it into memory all at once, but for larger datasets like ImageNet, this would no longer be possible. +The Gluon `Dataset` class from which `MNIST` derives supports both cases. +In general, `Dataset` and `DataLoader` (which we'll see in a second) are the machinery in MXNet that provides a stream of input data to be consumed by a training algorithm, typically in batches of multiple data entities at once for better efficiency. +In this tutorial, we will configure the data loader to feed examples in batches of 100. 
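A quick way to convince ourselves that the dataset behaves as expected is to index it directly. The following snippet is a minimal sketch (it assumes only the `train_data` object defined above); after our `transform`, the printed pixel values should lie in `[0, 1]`:

```python
# A `Dataset` supports direct indexing; each entry is an (image, label) pair.
image, label = train_data[0]
print(label)  # the digit depicted in the first training image
# After the rescaling `transform`, pixel values should lie in [0, 1]:
print(image.min().asscalar(), image.max().asscalar())
```

With the data in place, let's look at how batches of such images are commonly laid out in memory.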
-Image batches are commonly represented by a 4-D array with shape `(batch_size, num_channels, height, width)`. This convention is denoted by "BCHW", and it is the default in MXNet. For the MNIST dataset, each image has a size of 28x28 pixels and one color channel (grayscale), hence the shape of an input batch will be `(batch_size, 1, 28, 28)`. +An image batch is commonly represented by a 4-D array with shape `(batch_size, num_channels, height, width)`. +This convention is denoted by "BCHW", and it is the default in MXNet. +For the MNIST dataset, each image has a size of 28x28 pixels and one color channel (grayscale), hence the shape of an input batch will be `(batch_size, 1, 28, 28)`. -Another important consideration is the order of input samples. When feeding training examples, it is critical that we don't feed samples with the same label in succession. Doing so can slow down training. -Data iterators take care of this by randomly shuffling the inputs. Note that we only need to shuffle the training data. The order does not matter for test data. +Another important consideration is the order of input samples. +When feeding training examples, it is critical not feed samples with the same label in succession since doing so can slow down training. +Data iterators take care of this issue by randomly shuffling the inputs. +Note that we only need to shuffle the training data -- for validation data, the order does not matter. The following code initializes the data iterators for the MNIST dataset. @@ -62,27 +69,35 @@ val_loader = mx.gluon.data.DataLoader(val_data, shuffle=False, batch_size=batch_ ## Approaches -We will cover a couple of approaches for performing the hand written digit recognition task. The first approach makes use of a traditional deep neural network architecture called Multilayer Perceptron (MLP). We'll discuss its drawbacks and use that as a motivation to introduce a second more advanced approach called Convolution Neural Network (CNN) that has proven to work very well for image classification tasks. +We will cover a couple of approaches for performing the hand-written digit recognition task. +In our first attempt, we will make use of a traditional deep neural network architecture called Multilayer Perceptron (MLP). +Although this architecture gets us up to about 95.5 % accuracy on the validation set, we will recognize and discuss some of its drawbacks and use them as a motivation for using a different network. +In that second attempt, we introduce the more advanced and very widely used Convolutional Neural Network (CNN) architecture that has proven to work very well for image classification tasks. -Now, let's import required nn modules +As a first step, we do some convenience imports of frequently used modules. ```python -from __future__ import print_function +from __future__ import print_function # only relevant for Python 2 import mxnet as mx -from mxnet import gluon, autograd +from mxnet import nd, gluon, autograd from mxnet.gluon import nn ``` ### Define a network: Multilayer Perceptron -The first approach makes use of a [Multilayer Perceptron](https://en.wikipedia.org/wiki/Multilayer_perceptron) to solve this problem. We'll define the MLP using MXNet's imperative approach. +MLPs consist of several fully connected layers. +In a fully connected (short: FC) layer, each neuron is connected to every neuron in its preceding layer. 
+From a linear algebra perspective, an FC layer applies an [affine transform](https://en.wikipedia.org/wiki/Affine_transformation) *Y = X W + b* to an input matrix *X* of size (*n x m*) and outputs a matrix *Y* of size (*n x k*). +The number *k*, also referred to as *hidden size*, corresponds to the number of neurons in the FC layer. +An FC layer has two learnable parameters: the (*m x k*) weight matrix *W* and the (*1 x k*) bias vector *b*. -MLPs consist of several fully connected layers. A fully connected layer or FC layer for short, is one where each neuron in the layer is connected to every neuron in its preceding layer. From a linear algebra perspective, an FC layer applies an [affine transform](https://en.wikipedia.org/wiki/Affine_transformation) to the *n x m* input matrix *X* and outputs a matrix *Y* of size *n x k*, where *k* is the number of neurons in the FC layer. *k* is also referred to as the hidden size. The output *Y* is computed according to the equation *Y = X W + b*. The FC layer has two learnable parameters, the *m x k* weight matrix *W* and the *1 x k* bias vector *b*. - -In an MLP, the outputs of FC layers are typically fed into an activation function that applies an elementwise nonlinearity. This step is critical since it gives neural networks the ability to classify inputs that are not linearly separable. Common choices for activation functions are sigmoid, tanh, and [rectified linear unit](https://en.wikipedia.org/wiki/Rectifier_%28neural_networks%29) (ReLU). In this example, we'll use the ReLU activation function since it has several nice properties that make it a good default choice. +In an MLP, the outputs of FC layers are typically fed into an activation function that applies an elementwise nonlinearity. +This step is crucial since it gives neural networks the ability to classify inputs that are not linearly separable. +Common choices for activation functions are [sigmoid](https://en.wikipedia.org/wiki/Sigmoid_function), [hyperbolic tangent ("tanh")](https://en.wikipedia.org/wiki/Hyperbolic_function#Definitions), and [rectified linear unit (ReLU)](https://en.wikipedia.org/wiki/Rectifier_(neural_networks)). +In this example, we'll use the ReLU activation function since it has several nice properties that make it a good default choice. The following code declares three fully connected (or *dense*) layers with 128, 64 and 10 neurons each, where the last number of neurons matches the number of output classes in our dataset. -To build the neural network, We use a [`Sequential` layer](http://mxnet.io/api/python/gluon/gluon.html#mxnet.gluon.nn.Sequential), which is a convenience class to build a simple linear stack of layers (often called a *feed-forward neural net*). +To build the neural network, we use a [`Sequential` layer](https://mxnet.io/api/python/gluon/gluon.html#mxnet.gluon.nn.Sequential), which is a convenience class to build a linear stack of layers, often called a *feed-forward neural net*. ```python net = nn.Sequential() @@ -94,69 +109,88 @@ with net.name_scope(): #### Initialize parameters and optimizer -The following source code initializes all parameters received from parameter dict using [Xavier](http://mxnet.io/api/python/optimization/optimization.html#mxnet.initializer.Xavier) initializer -to train the MLP network we defined above. - -For our training, we will make use of the stochastic gradient descent (SGD) optimizer. In particular, we'll be using mini-batch SGD. Standard SGD processes train data one example at a time. 
In practice, this is very slow and one can speed up the process by processing examples in small batches. In this case, our batch size will be 100, which is a reasonable choice. Another parameter we select here is the learning rate, which controls the step size the optimizer takes in search of a solution. We'll pick a learning rate of 0.02, again a reasonable choice. Settings such as batch size and learning rate are what are usually referred to as hyper-parameters. What values we give them can have a great impact on training performance. +Before the network can be used, its parameters (weight and bias) need to be set to initial values that are sufficiently random while keeping the magnitude of gradients limited. +The [Xavier](https://mxnet.io/api/python/optimization/optimization.html#mxnet.initializer.Xavier) initializer is usually a good default choice. -We will use [Trainer](http://mxnet.io/api/python/gluon/gluon.html#trainer) class to apply the -[SGD optimizer](http://mxnet.io/api/python/optimization/optimization.html#mxnet.optimizer.SGD) on the -initialized parameters. +Since the `net.initialize()` method creates arrays for its parameters, it needs to know where to store the values: in CPU or GPU memory. +Like many other functions and classes that deal with memory management in one way or another, it takes an optional `ctx` (short for *context*) argument, where the return value of either `mx.cpu()` or `mx.gpu()` can be provided. ```python ctx = mx.gpu(0) if mx.context.num_gpus() > 0 else mx.cpu(0) -net.initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx) -trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.02}) +net.initialize(mx.init.Xavier(), ctx=ctx) ``` -#### Train the network +To train the network parameters, we will make use of the [stochastic gradient descent (SGD)](https://en.wikipedia.org/wiki/Stochastic_gradient_descent) optimizer. +More specifically, we use mini-batch SGD in contrast to the classical SGD that processes one example at a time, which is very slow in practice. +(Recall that we set the batch size to 100 in the ["Loading Data"](#loading-data) part.) -Typically, one runs the training until convergence, which means that we have learned a good set of model parameters (weights + biases) from the train data. For the purpose of this tutorial, we'll run training for 10 epochs and stop. An epoch is one full pass over the entire train data. +Besides the batch size, the SGD algorithm has one important *hyperparameter*: the *learning rate*. +It determines the size of steps that the algorithm takes in search of parameters that allow the network to optimally fit the training data, and as such it has great influence on both the course of the training process and its final outcome. +In general, hyperparameters refer to *non-learnable* values that need to be chosen before training and that have an effect on the outcome. +In this example, further hyperparameters are the number of layers in the network, the number of neurons of the first two layers, the activation function and (later) the loss function. -We will take following steps for training: +The SGD optimization method can be accessed in MXNet Gluon through the [`Trainer`](https://mxnet.io/api/python/gluon/gluon.html#trainer) class. +Internally, it makes use of the [`SGD`](https://mxnet.io/api/python/optimization/optimization.html#mxnet.optimizer.SGD) optimizer class. -- Define [Accuracy evaluation metric](http://mxnet.io/api/python/metric/metric.html#mxnet.metric.Accuracy) over training data. 
-- Loop over inputs for every epoch. -- Forward input through network to get output. -- Compute loss with output and label inside record scope. -- Backprop gradient inside record scope. -- Update evaluation metric and parameters with gradient descent. +```python +trainer = gluon.Trainer( + params=net.collect_params(), + optimizer='sgd', + optimizer_params={'learning_rate': 0.02}, +) +``` -Loss function takes (output, label) pairs and computes a scalar loss for each sample in the mini-batch. The scalars measure how far each output is from the label. -There are many predefined loss functions in gluon.loss. Here we use -[softmax_cross_entropy_loss](http://mxnet.io/api/python/gluon/gluon.html#mxnet.gluon.loss.softmax_cross_entropy_loss) for digit classification. We will compute loss and do backward propagation inside -training scope which is defined by `autograd.record()`. +#### Train the network + +Training the network requires a way to tell how well the network currently fits the training data, or how badly, expressed as a "loss" value, as it is customary in optimization. +Ideally, in a classification task, we would use the prediction inaccuracy, i.e., the fraction of incorrectly classified samples, to guide the training to a lower value. +Unfortunately, inaccuracy is a poor choice for training since it contains almost no information that can be used to update the network parameters (its gradient is zero almost everywhere). + +As a better behaved proxy for inaccuracy, the [softmax cross-entropy loss](https://mxnet.incubator.apache.org/api/python/gluon/loss.html#mxnet.gluon.loss.SoftmaxCrossEntropyLoss) is a popular choice. +It has the essential property of being minimal for the correct prediction, but at the same time, it is everywhere differentiable with nonzero gradient. + +We only use [accuracy](https://mxnet.incubator.apache.org/api/python/metric/metric.html#mxnet.metric.Accuracy) to monitor the training progress, since it is more intuitively interpretable. ```python -%%time +metric = mx.metric.Accuracy() +loss_function = gluon.loss.SoftmaxCrossEntropyLoss() +``` +Typically, the training is run until convergence, which means that further iterations will not improve the result any more, and that the network has probably learned a good set of model parameters from the train data. +For the purpose of this tutorial, we only loop 10 times over the entire dataset; one such pass over the data is usually called an *epoch*. + +The following steps are taken in each `epoch`: + +- Get a minibatch of `inputs` and `labels` from the `train_loader`. +- Feed the `inputs` to the network, producing `outputs`. +- Compute the minibatch loss value by comparing `outputs` to `labels`. +- Backpropagate the gradients to update the network parameters. +- Print the current accuracy over the training data, i.e., the fraction of correctly classified training examples. 
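+Before implementing these steps, it can help to see the loss function act on a tiny dummy batch first (a standalone sketch with made-up numbers, not part of the tutorial pipeline):
+
+```python
+# Two fake 3-class network outputs ("logits") and their true labels: the
+# first output confidently favors its correct class 0, while the second
+# one misses its correct class 2 by a wide margin.
+toy_outputs = nd.array([[5.0, 1.0, 1.0], [5.0, 1.0, 1.0]])
+toy_labels = nd.array([0, 2])
+print(loss_function(toy_outputs, toy_labels))  # roughly [0.04, 4.04]
+```
+
+The loss is close to zero for the confident correct prediction and large for the wrong one, varying smoothly in between -- exactly the property that gradient-based training needs. With these pieces in place, the steps above translate into the following training loop.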
+
```python
num_epochs = 10
-metric = mx.metric.Accuracy()
-softmax_cross_entropy_loss = gluon.loss.SoftmaxCrossEntropyLoss()
-for epoch in range(num_epochs):
-    # Restart the training data iterator at the beginning of each epoch
-    train_data.reset()
-    for batch in train_data:
-        # Possibly copy data and labels to the GPU
-        data = batch.data[0].copyto(ctx)
-        labels = batch.label[0].copyto(ctx)
+for epoch in range(num_epochs):
+    for inputs, labels in train_loader:
+        # Possibly copy inputs and labels to the GPU
+        inputs = inputs.as_in_context(ctx)
+        labels = labels.as_in_context(ctx)

        # The forward pass and the loss computation need to be wrapped
-        # in an `ag.record()` scope to indicate that the results will
+        # in a `record()` scope to indicate that the results will
        # be needed in the backward pass (gradient computation).
-        with ag.record():
-            out = net(data)
-            loss = softmax_cross_entropy_loss(out, labels)
+        with autograd.record():
+            outputs = net(inputs)
+            loss = loss_function(outputs, labels)

        # Compute gradients by backpropagation and update the evaluation
        # metric
        loss.backward()
-        metric.update(labels, out)
+        metric.update(labels, outputs)

        # Update the parameters by stepping the trainer; the batch size
        # is required to normalize the gradients by `1 / batch_size`.
-        trainer.step(batch_size=batch.data[0].shape[0])
+        trainer.step(batch_size=inputs.shape[0])

        # Print the evaluation metric and reset it for the next epoch
    name, acc = metric.get()
@@ -166,39 +200,41 @@ for epoch in range(num_epochs):

#### Prediction

-After the above training completes, we can evaluate the trained model by running predictions on validation dataset. Since the dataset also has labels for all test images, we can compute the accuracy metric over validation data as follows:
+When the above training has completed, we can evaluate the trained model by running predictions on validation dataset.
+Since the dataset also has labels for all test images, we can compute the accuracy metric over validation data as follows:

```python
metric = mx.metric.Accuracy()
-val_data.reset()
-for batch in val_data:
-    # Possibly copy data and labels to the GPU
-    data = batch.data[0].copyto(ctx)
-    labels = batch.label[0].copyto(ctx)
-    metric.update(labels, net(data))
+for inputs, labels in val_loader:
+    # Possibly copy inputs and labels to the GPU
+    inputs = inputs.as_in_context(ctx)
+    labels = labels.as_in_context(ctx)
+    metric.update(labels, net(inputs))
print('Validaton: {} = {}'.format(*metric.get()))
assert metric.get()[1] > 0.94
```

-If everything went well, we should see an accuracy value that is around 0.96, which means that we are able to accurately predict the digit in 96% of test images. This is a pretty good result. But as we will see in the next part of this tutorial, we can do a lot better than that.
+If everything went well, we should see an accuracy value that is around 0.954, which means that we are able to accurately predict the digit in 95.4 % of test images.
+This is a pretty good result, but as we will see in the next part of this tutorial, we can do a lot better than that.

-That said, a single numer only gives us very limited information on the performance of our neural network. It is always a good idea to actually look at the images on which the network performed poorly, and check for clues on how to improve the performance. 
We do that with the help of a small function that produces a list of the images where the network got it wrong, together with the predicted and true labels: +That said, a single number only conveys very limited information on the performance of our neural network. +It is always a good idea to actually look at the images on which the network performed poorly, and check for clues on how to improve the performance. +We do that with the help of a small function that produces a list of the images which the network got wrong, together with the predicted and true labels. ```python -def get_mislabelled(it): +def get_mislabelled(loader): """Return list of ``(input, pred_lbl, true_lbl)`` for mislabelled samples.""" mislabelled = [] - it.reset() - for batch in it: - data = batch.data[0].copyto(ctx) - labels = batch.label[0].copyto(ctx) - out = net(data) + for inputs, labels in loader: + inputs = inputs.as_in_context(ctx) + labels = labels.as_in_context(ctx) + outputs = net(inputs) # Predicted label is the index is where the output is maximal - preds = nd.argmax(out, axis=1) - for d, p, l in zip(data, preds, labels): + preds = nd.argmax(outputs, axis=1) + for i, p, l in zip(inputs, preds, labels): if p != l: mislabelled.append( - (d.asnumpy(), int(p.asnumpy()), int(l.asnumpy())) + (i.asnumpy(), int(p.asnumpy()), int(l.asnumpy())) ) return mislabelled ``` @@ -209,8 +245,8 @@ We can now get the mislabelled images in the training and validation sets and pl import numpy as np sample_size = 8 -wrong_train = get_mislabelled(train_data) -wrong_val = get_mislabelled(val_data) +wrong_train = get_mislabelled(train_loader) +wrong_val = get_mislabelled(val_loader) wrong_train_sample = [wrong_train[i] for i in np.random.randint(0, len(wrong_train), size=sample_size)] wrong_val_sample = [wrong_val[i] for i in np.random.randint(0, len(wrong_val), size=sample_size)] @@ -237,7 +273,8 @@ for ax, (img, pred, lbl) in zip(axs, wrong_val_sample): ![png](./wrong_train.png) ![png](./wrong_val.png) -In this case, it is rather obvious that our MLP network is too simple to perform really great on this dataset, as can be seen from the fact that some of the mislabelled examples are rather "simple" and should not be a challenge for our neural net. As it turns out, moving to the CNN architecture presented in the following section will give a big performance boost. +In this case, it is rather obvious that our MLP network is either too simple or not trained long enough to perform really great on this dataset, as can be seen from the fact that some of the mislabelled examples are rather "easy" and should not be a challenge for our neural net. +As it turns out, moving to the CNN architecture presented in the following section will give a big performance boost. ### Convolutional Neural Network @@ -376,6 +413,6 @@ If all went well, we should see a higher accuracy metric for predictions made us ## Summary -In this tutorial, we have learned how to use MXNet to solve a standard computer vision problem: classifying images of hand written digits. You have seen how to quickly and easily build, train and evaluate models such as MLP and CNN with MXNet Gluon package. +In this tutorial, we have learned how to use MXNet to solve a standard computer vision problem: classifying images of hand-written digits. You have seen how to quickly and easily build, train and evaluate models such as MLP and CNN with MXNet Gluon package. 
From 9b4cda07d5af5e62ffa69bf95e55bb24491ddcbc Mon Sep 17 00:00:00 2001 From: Holger Kohr Date: Tue, 27 Nov 2018 00:43:27 +0100 Subject: [PATCH 05/10] Fix broken links --- docs/tutorials/gluon/mnist.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/tutorials/gluon/mnist.md b/docs/tutorials/gluon/mnist.md index f0f6cda6a423..4f0c45cd78e6 100644 --- a/docs/tutorials/gluon/mnist.md +++ b/docs/tutorials/gluon/mnist.md @@ -8,13 +8,13 @@ MNIST is a widely used dataset for the hand-written digit classification task. I **Figure 1:** Sample images from the MNIST dataset. -This tutorial uses MXNet's high-level *Gluon* interface to implement neural networks in an imperative fashion. It is based on [the corresponding tutorial written with the symbolic approach](https://mxnet.io/tutorials/python/mnist.html). +This tutorial uses MXNet's high-level *Gluon* interface to implement neural networks in an imperative fashion. It is based on [the corresponding tutorial written with the symbolic approach](https://mxnet.incubator.apache.org/tutorials/python/mnist.html). ## Prerequisites To complete this tutorial, you need: -- MXNet. See the instructions for your operating system in [Setup and Installation](https://mxnet.io/install/index.html). +- MXNet. See the instructions for your operating system in [Setup and Installation](https://mxnet.incubator.apache.org/install/index.html). - The Python [`requests`](http://docs.python-requests.org/en/master/) library. - (Optional) The [Jupyter Notebook](https://jupyter.org/index.html) software for interactively running the provided `.ipynb` file. @@ -97,7 +97,7 @@ Common choices for activation functions are [sigmoid](https://en.wikipedia.org/w In this example, we'll use the ReLU activation function since it has several nice properties that make it a good default choice. The following code declares three fully connected (or *dense*) layers with 128, 64 and 10 neurons each, where the last number of neurons matches the number of output classes in our dataset. -To build the neural network, we use a [`Sequential` layer](https://mxnet.io/api/python/gluon/gluon.html#mxnet.gluon.nn.Sequential), which is a convenience class to build a linear stack of layers, often called a *feed-forward neural net*. +To build the neural network, we use a [`Sequential` layer](https://mxnet.incubator.apache.org/api/python/gluon/gluon.html#mxnet.gluon.nn.Sequential), which is a convenience class to build a linear stack of layers, often called a *feed-forward neural net*. ```python net = nn.Sequential() @@ -110,7 +110,7 @@ with net.name_scope(): #### Initialize parameters and optimizer Before the network can be used, its parameters (weight and bias) need to be set to initial values that are sufficiently random while keeping the magnitude of gradients limited. -The [Xavier](https://mxnet.io/api/python/optimization/optimization.html#mxnet.initializer.Xavier) initializer is usually a good default choice. +The [Xavier](https://mxnet.incubator.apache.org/api/python/optimization/optimization.html#mxnet.initializer.Xavier) initializer is usually a good default choice. Since the `net.initialize()` method creates arrays for its parameters, it needs to know where to store the values: in CPU or GPU memory. Like many other functions and classes that deal with memory management in one way or another, it takes an optional `ctx` (short for *context*) argument, where the return value of either `mx.cpu()` or `mx.gpu()` can be provided. 
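+For example, one can verify where newly created arrays end up by passing the chosen context explicitly (a minimal sketch, not part of the tutorial code):
+
+```python
+x = mx.nd.ones((2, 3), ctx=ctx)
+print(x.context)  # prints cpu(0), or gpu(0) if a GPU was detected
+```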
@@ -129,8 +129,8 @@ It determines the size of steps that the algorithm takes in search of parameters In general, hyperparameters refer to *non-learnable* values that need to be chosen before training and that have an effect on the outcome. In this example, further hyperparameters are the number of layers in the network, the number of neurons of the first two layers, the activation function and (later) the loss function. -The SGD optimization method can be accessed in MXNet Gluon through the [`Trainer`](https://mxnet.io/api/python/gluon/gluon.html#trainer) class. -Internally, it makes use of the [`SGD`](https://mxnet.io/api/python/optimization/optimization.html#mxnet.optimizer.SGD) optimizer class. +The SGD optimization method can be accessed in MXNet Gluon through the [`Trainer`](https://mxnet.incubator.apache.org/api/python/gluon/gluon.html#trainer) class. +Internally, it makes use of the [`SGD`](https://mxnet.incubator.apache.org/api/python/optimization/optimization.html#mxnet.optimizer.SGD) optimizer class. ```python trainer = gluon.Trainer( From 54b09aec1cef85474aaa9eb5210ee4453512443f Mon Sep 17 00:00:00 2001 From: Holger Kohr Date: Tue, 27 Nov 2018 00:45:29 +0100 Subject: [PATCH 06/10] Fix spelling of mislabeled --- docs/tutorials/gluon/mnist.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/tutorials/gluon/mnist.md b/docs/tutorials/gluon/mnist.md index 4f0c45cd78e6..a0f1030b1e63 100644 --- a/docs/tutorials/gluon/mnist.md +++ b/docs/tutorials/gluon/mnist.md @@ -222,9 +222,9 @@ It is always a good idea to actually look at the images on which the network per We do that with the help of a small function that produces a list of the images which the network got wrong, together with the predicted and true labels. ```python -def get_mislabelled(loader): - """Return list of ``(input, pred_lbl, true_lbl)`` for mislabelled samples.""" - mislabelled = [] +def get_mislabeled(loader): + """Return list of ``(input, pred_lbl, true_lbl)`` for mislabeled samples.""" + mislabeled = [] for inputs, labels in loader: inputs = inputs.as_in_context(ctx) labels = labels.as_in_context(ctx) @@ -233,20 +233,20 @@ def get_mislabelled(loader): preds = nd.argmax(outputs, axis=1) for i, p, l in zip(inputs, preds, labels): if p != l: - mislabelled.append( + mislabeled.append( (i.asnumpy(), int(p.asnumpy()), int(l.asnumpy())) ) - return mislabelled + return mislabeled ``` -We can now get the mislabelled images in the training and validation sets and plot a selection of them: +We can now get the mislabeled images in the training and validation sets and plot a selection of them: ```python import numpy as np sample_size = 8 -wrong_train = get_mislabelled(train_loader) -wrong_val = get_mislabelled(val_loader) +wrong_train = get_mislabeled(train_loader) +wrong_val = get_mislabeled(val_loader) wrong_train_sample = [wrong_train[i] for i in np.random.randint(0, len(wrong_train), size=sample_size)] wrong_val_sample = [wrong_val[i] for i in np.random.randint(0, len(wrong_val), size=sample_size)] @@ -273,7 +273,7 @@ for ax, (img, pred, lbl) in zip(axs, wrong_val_sample): ![png](./wrong_train.png) ![png](./wrong_val.png) -In this case, it is rather obvious that our MLP network is either too simple or not trained long enough to perform really great on this dataset, as can be seen from the fact that some of the mislabelled examples are rather "easy" and should not be a challenge for our neural net. 
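+To check whether the errors are systematic, one can additionally tally the most frequent `(true, predicted)` pairs among the mislabeled validation samples (a small sketch reusing the `wrong_val` list from above):
+
+```python
+from collections import Counter
+
+# Each entry of `wrong_val` is an (image, predicted, true) triple.
+confusions = Counter((true, pred) for _, pred, true in wrong_val)
+print(confusions.most_common(5))
+```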
+In this case, it is rather obvious that our MLP network is either too simple or not trained long enough to perform really great on this dataset, as can be seen from the fact that some of the mislabeled examples are rather "easy" and should not be a challenge for our neural net. As it turns out, moving to the CNN architecture presented in the following section will give a big performance boost. ### Convolutional Neural Network From 92e199faff007caf4bc728b51c86f083de52f34b Mon Sep 17 00:00:00 2001 From: Holger Kohr Date: Wed, 28 Nov 2018 23:04:17 +0100 Subject: [PATCH 07/10] Final rewordings and code simplifications --- docs/tutorials/gluon/mnist.md | 316 ++++++++++++++++++---------------- 1 file changed, 166 insertions(+), 150 deletions(-) diff --git a/docs/tutorials/gluon/mnist.md b/docs/tutorials/gluon/mnist.md index a0f1030b1e63..5b5bffbfb809 100644 --- a/docs/tutorials/gluon/mnist.md +++ b/docs/tutorials/gluon/mnist.md @@ -27,7 +27,8 @@ $ pip install requests jupyter The following code downloads the MNIST dataset to the default location (`.mxnet/datasets/mnist/` in your home directory) and creates `Dataset` objects `train_data` and `val_data` for training and validation, respectively. These objects can later be used to get one image or a batch of images at a time, together with their corresponding labels. -We also add a `transform` function that rescales the images from `[0, 255]` to `[0, 1]`. +We also immediately apply the `transform_first()` method and supply a function that moves the channel axis of the images to the beginning (`(28, 28, 1) -> (1, 28, 28)`), casts them to `float32` and rescales them from `[0, 255]` to `[0, 1]`. +The name `transform_first` reflects the fact that these datasets contain images and labels, and that the transform should only be applied to the first of each `(image, label)` pair. ```python import mxnet as mx @@ -35,22 +36,20 @@ import mxnet as mx # Select a fixed random seed for reproducibility mx.random.seed(42) -train_data = mx.gluon.data.vision.MNIST( - train=True, - transform=lambda data, label: (data.astype("float32") / 255, label), -) -val_data = mx.gluon.data.vision.MNIST( - train=False, - transform=lambda data, label: (data.astype("float32") / 255, label), -) +def data_xform(data): + """Move channel axis to the beginning, cast to float32, and normalize to [0, 1].""" + return nd.moveaxis(data, 2, 0).astype('float32') / 255 + +train_data = mx.gluon.data.vision.MNIST(train=True).transform_first(data_xform) +val_data = mx.gluon.data.vision.MNIST(train=False).transform_first(data_xform) ``` -Since the MNIST dataset is relatively small, this class loads it into memory all at once, but for larger datasets like ImageNet, this would no longer be possible. +Since the MNIST dataset is relatively small, the `MNIST` class loads it into memory all at once, but for larger datasets like ImageNet, this would no longer be possible. The Gluon `Dataset` class from which `MNIST` derives supports both cases. -In general, `Dataset` and `DataLoader` (which we'll see in a second) are the machinery in MXNet that provides a stream of input data to be consumed by a training algorithm, typically in batches of multiple data entities at once for better efficiency. +In general, `Dataset` and `DataLoader` (which we will encounter next) are the machinery in MXNet that provides a stream of input data to be consumed by a training algorithm, typically in batches of multiple data entities at once for better efficiency. 
In this tutorial, we will configure the data loader to feed examples in batches of 100. -An image batch is commonly represented by a 4-D array with shape `(batch_size, num_channels, height, width)`. +An image batch is commonly represented as a 4-D array with shape `(batch_size, num_channels, height, width)`. This convention is denoted by "BCHW", and it is the default in MXNet. For the MNIST dataset, each image has a size of 28x28 pixels and one color channel (grayscale), hence the shape of an input batch will be `(batch_size, 1, 28, 28)`. @@ -69,12 +68,12 @@ val_loader = mx.gluon.data.DataLoader(val_data, shuffle=False, batch_size=batch_ ## Approaches -We will cover a couple of approaches for performing the hand-written digit recognition task. -In our first attempt, we will make use of a traditional deep neural network architecture called Multilayer Perceptron (MLP). -Although this architecture gets us up to about 95.5 % accuracy on the validation set, we will recognize and discuss some of its drawbacks and use them as a motivation for using a different network. -In that second attempt, we introduce the more advanced and very widely used Convolutional Neural Network (CNN) architecture that has proven to work very well for image classification tasks. +We will cover two approaches for performing the hand-written digit recognition task. +In our first attempt, we will make use of a traditional neural network architecture called [Multilayer Perceptron (MLP)](https://en.wikipedia.org/wiki/Multilayer_perceptron). +Although this architecture lets us achieve about 95.5 % accuracy on the validation set, we will recognize and discuss some of its drawbacks and use them as a motivation for using a different network. +In the subsequent second attempt, we introduce the more advanced and very widely used [Convolutional Neural Network (CNN)](https://en.wikipedia.org/wiki/Convolutional_neural_network) architecture that has proven to work very well for image classification tasks. -As a first step, we do some convenience imports of frequently used modules. +As a first step, we run some convenience imports of frequently used modules. ```python from __future__ import print_function # only relevant for Python 2 @@ -83,7 +82,7 @@ from mxnet import nd, gluon, autograd from mxnet.gluon import nn ``` -### Define a network: Multilayer Perceptron +### Defining a network: Multilayer Perceptron MLPs consist of several fully connected layers. In a fully connected (short: FC) layer, each neuron is connected to every neuron in its preceding layer. @@ -96,24 +95,31 @@ This step is crucial since it gives neural networks the ability to classify inpu Common choices for activation functions are [sigmoid](https://en.wikipedia.org/wiki/Sigmoid_function), [hyperbolic tangent ("tanh")](https://en.wikipedia.org/wiki/Hyperbolic_function#Definitions), and [rectified linear unit (ReLU)](https://en.wikipedia.org/wiki/Rectifier_(neural_networks)). In this example, we'll use the ReLU activation function since it has several nice properties that make it a good default choice. -The following code declares three fully connected (or *dense*) layers with 128, 64 and 10 neurons each, where the last number of neurons matches the number of output classes in our dataset. +The following code snippet declares three fully connected (or *dense*) layers with 128, 64 and 10 neurons each, where the last number of neurons matches the number of output classes in our dataset. 
+Note that the last layer uses no activation function since the [softmax](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.softmax) activation will be implicitly applied by the loss function later on.
To build the neural network, we use a [`Sequential` layer](https://mxnet.incubator.apache.org/api/python/gluon/gluon.html#mxnet.gluon.nn.Sequential), which is a convenience class to build a linear stack of layers, often called a *feed-forward neural net*.

```python
-net = nn.Sequential()
+net = nn.Sequential('MLP')
with net.name_scope():
-    net.add(nn.Dense(128, activation='relu'))
-    net.add(nn.Dense(64, activation='relu'))
-    net.add(nn.Dense(10))
+    net.add(
+        nn.Flatten(),
+        nn.Dense(128, activation='relu'),
+        nn.Dense(64, activation='relu'),
+        nn.Dense(10)
+    )
```

-#### Initialize parameters and optimizer
+#### Initializing parameters and optimizer

-Before the network can be used, its parameters (weight and bias) need to be set to initial values that are sufficiently random while keeping the magnitude of gradients limited.
+Before the network can be used, its parameters (weights and biases) need to be set to initial values that are sufficiently random while keeping the magnitude of gradients limited.
The [Xavier](https://mxnet.incubator.apache.org/api/python/optimization/optimization.html#mxnet.initializer.Xavier) initializer is usually a good default choice.

Since the `net.initialize()` method creates arrays for its parameters, it needs to know where to store the values: in CPU or GPU memory.
-Like many other functions and classes that deal with memory management in one way or another, it takes an optional `ctx` (short for *context*) argument, where the return value of either `mx.cpu()` or `mx.gpu()` can be provided.
+Like many other functions and classes that deal with memory management in one way or another, the `initialize()` method takes an optional `ctx` (short for *context*) argument, where the return value of either `mx.cpu()` or `mx.gpu()` can be provided.

```python
ctx = mx.gpu(0) if mx.context.num_gpus() > 0 else mx.cpu(0)
@@ -125,8 +131,9 @@ More specifically, we use mini-batch SGD in contrast to the classical SGD that p
(Recall that we set the batch size to 100 in the ["Loading Data"](#loading-data) part.)

Besides the batch size, the SGD algorithm has one important *hyperparameter*: the *learning rate*.
-It determines the size of steps that the algorithm takes in search of parameters that allow the network to optimally fit the training data, and as such it has great influence on both the course of the training process and its final outcome.
-In general, hyperparameters refer to *non-learnable* values that need to be chosen before training and that have an effect on the outcome.
+It determines the size of steps that the algorithm takes in search of parameters that allow the network to optimally fit the training data.
+Therefore, this value has great influence on both the course of the training process and its final outcome.
+In general, hyperparameters refer to *non-learnable* values that need to be chosen before training and that have a potential effect on the outcome. 
In this example, further hyperparameters are the number of layers in the network, the number of neurons of the first two layers, the activation function and (later) the loss function. The SGD optimization method can be accessed in MXNet Gluon through the [`Trainer`](https://mxnet.incubator.apache.org/api/python/gluon/gluon.html#trainer) class. @@ -140,31 +147,32 @@ trainer = gluon.Trainer( ) ``` -#### Train the network +#### Training -Training the network requires a way to tell how well the network currently fits the training data, or how badly, expressed as a "loss" value, as it is customary in optimization. -Ideally, in a classification task, we would use the prediction inaccuracy, i.e., the fraction of incorrectly classified samples, to guide the training to a lower value. -Unfortunately, inaccuracy is a poor choice for training since it contains almost no information that can be used to update the network parameters (its gradient is zero almost everywhere). +Training the network requires a way to tell how well the network currently fits the training data. +Following common practice in optimization, this quality of fit is expressed through a *loss value* (also referred to as badness-of-fit or data discrepancy), which the algorithm then tries to minimize by adjusting the weights of the model. +Ideally, in a classification task, we would like to use the prediction inaccuracy, i.e., the fraction of incorrectly classified samples, to guide the training to a lower value. +Unfortunately, inaccuracy is a poor choice for training since it contains almost no information that can be used to update the network parameters (its gradient is zero almost everywhere). As a better behaved proxy for inaccuracy, the [softmax cross-entropy loss](https://mxnet.incubator.apache.org/api/python/gluon/loss.html#mxnet.gluon.loss.SoftmaxCrossEntropyLoss) is a popular choice. It has the essential property of being minimal for the correct prediction, but at the same time, it is everywhere differentiable with nonzero gradient. - -We only use [accuracy](https://mxnet.incubator.apache.org/api/python/metric/metric.html#mxnet.metric.Accuracy) to monitor the training progress, since it is more intuitively interpretable. +The [accuracy](https://mxnet.incubator.apache.org/api/python/metric/metric.html#mxnet.metric.Accuracy) metric is still useful for monitoring the training progress, since it is more intuitively interpretable than a loss value. ```python metric = mx.metric.Accuracy() loss_function = gluon.loss.SoftmaxCrossEntropyLoss() ``` -Typically, the training is run until convergence, which means that further iterations will not improve the result any more, and that the network has probably learned a good set of model parameters from the train data. -For the purpose of this tutorial, we only loop 10 times over the entire dataset; one such pass over the data is usually called an *epoch*. +Typically, the training is run until convergence, which means that further iterations will no longer lead to improvements of the loss function, and that the network has probably learned a good set of model parameters from the train data. +For the purpose of this tutorial, we only loop 10 times over the entire dataset. +One such pass over the data is usually called an *epoch*. The following steps are taken in each `epoch`: - Get a minibatch of `inputs` and `labels` from the `train_loader`. - Feed the `inputs` to the network, producing `outputs`. -- Compute the minibatch loss value by comparing `outputs` to `labels`. 
-- Backpropagate the gradients to update the network parameters. +- Compute the minibatch `loss` value by comparing `outputs` to `labels`. +- Backpropagate the gradients to update the network parameters by calling `loss.backward()`. - Print the current accuracy over the training data, i.e., the fraction of correctly classified training examples. ```python @@ -198,10 +206,16 @@ for epoch in range(num_epochs): metric.reset() ``` -#### Prediction +#### Validation + +When the above training has completed, we can evaluate the trained model by comparing predictions from the validation dataset with their respective correct labels. +It is important to notice that the validation data was not used during training, i.e., the network has not seen the images and their true labels yet. +Keeping a part of the data aside for validation is crucial for detecting *overfitting* of a network: If a neural network has enough parameters, it can simply memorize the training data and look up the true label for a given training image. +While this results in 100 % training accuracy, such an overfit model would perform very poorly on new data. +In other words, an overfit model does not generalize to a broader class of inputs than the training set, and such an outcome is almost always undesirable. +Therefore, having a subset of "unseen" data for validation is an important part of good practice in machine learning. -When the above training has completed, we can evaluate the trained model by running predictions on validation dataset. -Since the dataset also has labels for all test images, we can compute the accuracy metric over validation data as follows: +To validate our model on the validation data, we can run the following snippet of code: ```python metric = mx.metric.Accuracy() @@ -234,7 +248,7 @@ def get_mislabeled(loader): for i, p, l in zip(inputs, preds, labels): if p != l: mislabeled.append( - (i.asnumpy(), int(p.asnumpy()), int(l.asnumpy())) + (i.asnumpy(), int(p.asscalar()), int(l.asscalar())) ) return mislabeled ``` @@ -273,146 +287,148 @@ for ax, (img, pred, lbl) in zip(axs, wrong_val_sample): ![png](./wrong_train.png) ![png](./wrong_val.png) -In this case, it is rather obvious that our MLP network is either too simple or not trained long enough to perform really great on this dataset, as can be seen from the fact that some of the mislabeled examples are rather "easy" and should not be a challenge for our neural net. +In this case, it is rather obvious that our MLP network is either too simple or has not been trained long enough to perform really great on this dataset, as can be seen from the fact that some of the mislabeled examples are rather "easy" and should not be a challenge for our neural net. As it turns out, moving to the CNN architecture presented in the following section will give a big performance boost. ### Convolutional Neural Network -Earlier, we briefly touched on a drawback of MLP when we said we need to discard the input image's original shape and flatten it as a vector before we can feed it as input to the MLP's first fully connected layer. Turns out this is an important issue because we don't take advantage of the fact that pixels in the image have natural spatial correlation along the horizontal and vertical axes. A convolutional neural network (CNN) aims to address this problem by using a more structured weight representation. 
Instead of flattening the image and doing a simple matrix-matrix multiplication, it employs one or more convolutional layers that each performs a 2-D convolution on the input image. +A fundamental issue with the MLP network is that it requires the inputs to be flattened (in the non-batch axes) before they can be processed by the dense layers. +This means in particular that the spatial structure of an image is largely discarded, and that the values describing it are just treated as a long vector. +The network then has to figure out the neighborhood relations of pixels from scratch by adjusting its weights accordingly, which seems very wasteful. -A single convolution layer consists of one or more filters that each play the role of a feature detector. During training, a CNN learns appropriate representations (parameters) for these filters. Similar to MLP, the output from the convolutional layer is transformed by applying a non-linearity. Besides the convolutional layer, another key aspect of a CNN is the pooling layer. A pooling layer serves to make the CNN translation invariant: a digit remains the same even when it is shifted left/right/up/down by a few pixels. A pooling layer reduces a *n x m* patch into a single value to make the network less sensitive to the spatial location. Pooling layer is always included after each conv (+ activation) layer in the CNN. +A convolutional neural network (CNN) aims to address this problem by using a more structured weight representation. +Instead of connecting all inputs to all outputs, the characteristic [convolution layer](https://mxnet.incubator.apache.org/api/python/gluon/nn.html#mxnet.gluon.nn.Conv2D) only considers a small neighborhood of a pixel to compute the value of the corresponding output pixel. +In particular, the spatial structure of the image is preserved, i.e., one can speak of input and output pixels in the first place. +Only the size of the image may change through convolutions. +[This article](http://deeplearning.net/software/theano/tutorial/conv_arithmetic.html) gives a good and intuitive explanation of convolutions in the context of deep learning. -The following source code defines a convolutional neural network architecture called LeNet. LeNet is a popular network known to work well on digit classification tasks. We will use a slightly different version from the original LeNet implementation, replacing the sigmoid activations with tanh activations for the neurons. - -A typical way to write your network is creating a new class inherited from `gluon.Block` -class. We can define the network by composing and inheriting Block class as follows: +The size of the neighborhood that a convolution layer considers for each pixel is usually referred to as *filter size* or *kernel size*. +The array of weights -- which does not depend on the output pixel location, only on the position within such a neighborhood -- is called *filter* or *kernel*. +Typical filter sizes range from *3 x 3* to *13 x 13*, which implies that a convolution layer has *far* fewer parameters than a dense layer. ```python -import mxnet.ndarray as F - -class Net(gluon.Block): - def __init__(self, **kwargs): - super(Net, self).__init__(**kwargs) - with self.name_scope(): - # layers created in name_scope will inherit name space - # from parent layer. 
- self.conv1 = nn.Conv2D(20, kernel_size=(5,5)) - self.pool1 = nn.MaxPool2D(pool_size=(2,2), strides = (2,2)) - self.conv2 = nn.Conv2D(50, kernel_size=(5,5)) - self.pool2 = nn.MaxPool2D(pool_size=(2,2), strides = (2,2)) - self.fc1 = nn.Dense(500) - self.fc2 = nn.Dense(10) - - def forward(self, x): - x = self.pool1(F.tanh(self.conv1(x))) - x = self.pool2(F.tanh(self.conv2(x))) - # 0 means copy over size from corresponding dimension. - # -1 means infer size from the rest of dimensions. - x = x.reshape((0, -1)) - x = F.tanh(self.fc1(x)) - x = F.tanh(self.fc2(x)) - return x +conv_layer = nn.Conv2D(kernel_size=(3, 3), channels=32, in_channels=16, activation='relu') +print(conv_layer.params) +# Output: +# conv0_ ( +# Parameter conv0_weight (shape=(32, 16, 3, 3), dtype=) +# Parameter conv0_bias (shape=(32,), dtype=) +# ) ``` -We just defined the forward function here, and the backward function to compute gradients -is automatically defined for you using autograd. -We also imported `mxnet.ndarray` package to use activation functions from `ndarray` API. +Filters can be thought of as little feature detectors: in early layers, they learn to detect small local structures like edges, whereas later layers become sensitive to more and more global structures. +Since images often contain a rich set of such features, it is customary to have each convolution layer employ and learn many different filters in parallel, so as to detect many different image features on their respective scales. +This stacking of filters, which directly translates to a stacking of output images, is referred to as output *channels* of the convolution layer. +Likewise, the input can already have multiple channels. +In the above example, the convolution layer takes an input image with 16 channels and maps it to an image with 32 channels by convolving each of the input channels with a different set of 32 filters and then summing over the 16 input channels. +Therefore, the total number of filter parameters in the convolution layer is `channels * in_channels * prod(kernel_size)`, which amounts to 4608 in the above example. + +Another characteristic feature of CNNs is the usage of *pooling*, i.e., summarizing patches to a single number, to shrink the size of an image as it travels through the layers. +This step lowers the computational burden of training the network, but the main motivation for pooling is the assumption that it makes the network less sensitive to small translations, rotations or deformations of the image. +Popular pooling strategies are max-pooling and average-pooling, and they are usually performed after convolution. + +The following code defines a CNN architecture called *LeNet*. +The LeNet architecture is a popular network known to work well on digit classification tasks. +We will use a version that differs slightly from the original in the usage of `tanh` activations instead of `sigmoid`. -Now, We will create the network as follows: +```python +lenet = nn.Sequential('LeNet') +lenet.add( + nn.Conv2D(channels=20, kernel_size=(5, 5), activation='tanh'), + nn.MaxPool2D(pool_size=(2, 2), strides=(2, 2)), + nn.Conv2D(channels=50, kernel_size=(5, 5), activation='tanh'), + nn.MaxPool2D(pool_size=(2, 2), strides=(2, 2)), + nn.Flatten(), + nn.Dense(500, activation='tanh'), + nn.Dense(10, activation='tanh'), +) +``` + +To get an overview of all intermediate sizes of arrays and the number of parameters in each layer, the `summary()` method can be a great help. 
+It requires the network parameters to be initialized, and an input array to infer the sizes. ```python -net = Net() +lenet.initialize(mx.init.Xavier(), ctx=ctx) +lenet.summary(nd.zeros((1, 1, 28, 28), ctx=ctx)) +# Output: +# +# -------------------------------------------------------------------------------- +# Layer (type) Output Shape Param # +# ================================================================================ +# Input (1, 1, 28, 28) 0 +# Activation-1 0 +# Activation-2 (1, 20, 24, 24) 0 +# Conv2D-3 (1, 20, 24, 24) 520 +# MaxPool2D-4 (1, 20, 12, 12) 0 +# Activation-5 0 +# Activation-6 (1, 50, 8, 8) 0 +# Conv2D-7 (1, 50, 8, 8) 25050 +# MaxPool2D-8 (1, 50, 4, 4) 0 +# Flatten-9 (1, 800) 0 +# Activation-10 0 +# Activation-11 (1, 500) 0 +# Dense-12 (1, 500) 400500 +# Activation-13 0 +# Activation-14 (1, 10) 0 +# Dense-15 (1, 10) 5010 +# ================================================================================ +# Parameters in forward computation graph, duplicate included +# Total params: 431080 +# Trainable params: 431080 +# Non-trainable params: 0 +# Shared params in forward computation graph: 0 +# Unique parameters in model: 431080 +# -------------------------------------------------------------------------------- ``` ![png](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/image/conv_mnist.png) **Figure 3:** First conv + pooling layer in LeNet. -Now we train LeNet with similar hyper-parameters as before. Note that, if a GPU is available, we recommend using it. This greatly speeds up computation given that LeNet is more complex and compute-intensive than the previous multilayer perceptron. To do so, we only need to change `mx.cpu()` to `mx.gpu()` and MXNet takes care of the rest. Just like before, we'll stop training after 10 epochs. +Now we train LeNet with similar hyperparameters and procedure as before. +Note that it is advisable to use a GPU if possible, since this model is significantly more computationally demanding to evaluate and train than the previous MLP. -Training and prediction can be done in the similar way as we did for MLP. +```python +trainer = gluon.Trainer( + params=lenet.collect_params(), + optimizer='sgd', + optimizer_params={'learning_rate': 0.02}, +) +metric = mx.metric.Accuracy() +num_epochs = 10 -#### Initialize parameters and optimizer +for epoch in range(num_epochs): + for inputs, labels in train_loader: + inputs = inputs.as_in_context(ctx) + labels = labels.as_in_context(ctx) -We will initialize the network parameters as follows: + with autograd.record(): + outputs = lenet(inputs) + loss = loss_function(outputs, labels) -```python -# set the context on GPU is available otherwise CPU -ctx = [mx.gpu() if mx.test_utils.list_gpus() else mx.cpu()] -net.initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx) -trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.03}) -``` + loss.backward() + metric.update(labels, outputs) -#### Training + trainer.step(batch_size=inputs.shape[0]) -```python -# Use Accuracy as the evaluation metric. -metric = mx.metric.Accuracy() -softmax_cross_entropy_loss = gluon.loss.SoftmaxCrossEntropyLoss() - -for i in range(num_epochs): - # Reset the train data iterator. - train_data.reset() - # Loop over the train data iterator. - for batch in train_data: - # Splits train data into multiple slices along batch_axis - # and copy each slice into a context. 
-        data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0)
-        # Splits train labels into multiple slices along batch_axis
-        # and copy each slice into a context.
-        label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0)
-        outputs = []
-        # Inside training scope
-        with ag.record():
-            for x, y in zip(data, label):
-                z = net(x)
-                # Computes softmax cross entropy loss.
-                loss = softmax_cross_entropy_loss(z, y)
-                # Backpropogate the error for one iteration.
-                loss.backward()
-                outputs.append(z)
-        # Updates internal evaluation
-        metric.update(label, outputs)
-        # Make one step of parameter update. Trainer needs to know the
-        # batch size of data to normalize the gradient by 1/batch_size.
-        trainer.step(batch.data[0].shape[0])
-    # Gets the evaluation result.
    name, acc = metric.get()
-    # Reset evaluation result to initial state.
+    print('After epoch {}: {} = {}'.format(epoch + 1, name, acc))
    metric.reset()
-    print('training acc at epoch %d: %s=%f'%(i, name, acc))
-```

-#### Prediction
-
-Finally, we'll use the trained LeNet model to generate predictions for the test data.
-
-```python
-# Use Accuracy as the evaluation metric.
-metric = mx.metric.Accuracy()
-# Reset the validation data iterator.
-val_data.reset()
-# Loop over the validation data iterator.
-for batch in val_data:
-    # Splits validation data into multiple slices along batch_axis
-    # and copy each slice into a context.
-    data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0)
-    # Splits validation label into multiple slices along batch_axis
-    # and copy each slice into a context.
-    label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0)
-    outputs = []
-    for x in data:
-        outputs.append(net(x))
-    # Updates internal evaluation
-    metric.update(label, outputs)
-print('validation acc: %s=%f'%metric.get())
-assert metric.get()[1] > 0.98
+for inputs, labels in val_loader:
+    inputs = inputs.as_in_context(ctx)
+    labels = labels.as_in_context(ctx)
+    metric.update(labels, lenet(inputs))
+print('Validation: {} = {}'.format(*metric.get()))
+assert metric.get()[1] > 0.975
```

-If all went well, we should see a higher accuracy metric for predictions made using LeNet. With CNN we should be able to correctly predict around 98% of all test images.
+If all went well, we should see a higher accuracy metric for predictions made using LeNet.
+With this CNN we should be able to correctly predict around 98% of all test images.

## Summary

-In this tutorial, we have learned how to use MXNet to solve a standard computer vision problem: classifying images of hand-written digits. You have seen how to quickly and easily build, train and evaluate models such as MLP and CNN with MXNet Gluon package.
+In this tutorial, we demonstrated how to use MXNet to solve a standard computer vision problem: classifying images of hand-written digits.
+We showed how to quickly build, train and evaluate models such as MLPs and CNNs with the MXNet Gluon package. 
From be966bd89ebcb34605fc1e627d80ff033da6196b Mon Sep 17 00:00:00 2001
From: Holger Kohr
Date: Fri, 30 Nov 2018 00:47:01 +0100
Subject: [PATCH 08/10] Fix things according to review

- Apply hybrid blocks
- Move outputs outside of code blocks and mark for notebooks to ignore
- Remove images, use external link
- Fix a few formulations
---
 docs/tutorials/gluon/mnist.md        | 109 +++++++++++++++------------
 docs/tutorials/gluon/wrong_train.png | Bin 15741 -> 0 bytes
 docs/tutorials/gluon/wrong_val.png   | Bin 15438 -> 0 bytes
 3 files changed, 59 insertions(+), 50 deletions(-)
 delete mode 100644 docs/tutorials/gluon/wrong_train.png
 delete mode 100644 docs/tutorials/gluon/wrong_val.png

diff --git a/docs/tutorials/gluon/mnist.md b/docs/tutorials/gluon/mnist.md
index 5b5bffbfb809..9845fc8d95ff 100644
--- a/docs/tutorials/gluon/mnist.md
+++ b/docs/tutorials/gluon/mnist.md
@@ -50,12 +50,12 @@ In general, `Dataset` and `DataLoader` (which we will encounter next) are the ma
In this tutorial, we will configure the data loader to feed examples in batches of 100.

An image batch is commonly represented as a 4-D array with shape `(batch_size, num_channels, height, width)`.
-This convention is denoted by "BCHW", and it is the default in MXNet.
+This convention is denoted by "NCHW", and it is the default in MXNet.
For the MNIST dataset, each image has a size of 28x28 pixels and one color channel (grayscale), hence the shape of an input batch will be `(batch_size, 1, 28, 28)`.

Another important consideration is the order of input samples.
-When feeding training examples, it is critical not feed samples with the same label in succession since doing so can slow down training.
-Data iterators take care of this issue by randomly shuffling the inputs.
+When feeding training examples, it is critical not to feed samples with the same label in succession since doing so can slow down training progress.
+Data iterators, i.e., instances of [`DataLoader`](https://mxnet.incubator.apache.org/api/python/gluon/data.html#mxnet.gluon.data.DataLoader), take care of this issue by randomly shuffling the inputs.
Note that we only need to shuffle the training data -- for validation data, the order does not matter.

The following code initializes the data iterators for the MNIST dataset.
@@ -82,7 +82,7 @@ from mxnet import nd, gluon, autograd
from mxnet.gluon import nn
```

-### Defining a network: Multilayer Perceptron
+### Defining a network: Multilayer Perceptron (MLP)

MLPs consist of several fully connected layers.
In a fully connected (short: FC) layer, each neuron is connected to every neuron in its preceding layer.
@@ -97,13 +97,15 @@ Common choices for activation functions are [sigmoid](https://en.wikipedia.org/w
In this example, we'll use the ReLU activation function since it has several nice properties that make it a good default choice.

The following code snippet declares three fully connected (or *dense*) layers with 128, 64 and 10 neurons each, where the last number of neurons matches the number of output classes in our dataset.
Note that the last layer uses no activation function since the [softmax](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.softmax) activation will be implicitly applied by the loss function later on.

-To build the neural network, we use a [`Sequential` layer](https://mxnet.incubator.apache.org/api/python/gluon/gluon.html#mxnet.gluon.nn.Sequential), which is a convenience class to build a linear stack of layers, often called a *feed-forward neural net*. 
+To build the neural network, we use a [`HybridSequential`](https://mxnet.incubator.apache.org/api/python/gluon/gluon.html#mxnet.gluon.nn.HybridSequential) layer, which is a convenience class to build a linear stack of layers, often called a *feed-forward neural net*. -**Note**: using the `name_scope()` context manager is optional. -It is, however, good practice since it uses a common prefix for the names of all layers generated in that scope, which can be very helpful during debugging. +The "Hybrid" part of name `HybridSequential` refers to the fact that such a layer can be used with both the Gluon API and the Symbol API. +Using hybrid blocks over dynamic-only blocks (e.g. [`Sequential`](https://mxnet.incubator.apache.org/api/python/gluon/gluon.html#mxnet.gluon.nn.Sequential)) has several advantages apart from being compatible with a wider range of existing code: for instance, the computation graph of the network can be visualized with `mxnet.viz.plot_network()` and inspected for errors. +Unless a network requires non-static runtime elements like loops, conditionals or random layer selection in its forward pass, it is generally a good idea to err on the side of hybrid blocks. +For details on the differences, see the documentation on [`Block`](https://mxnet.incubator.apache.org/api/python/gluon/gluon.html#mxnet.gluon.Block) and [`HybridBlock`](https://mxnet.incubator.apache.org/api/python/gluon/gluon.html#mxnet.gluon.HybridBlock). ```python -net = nn.Sequential('MLP') +net = nn.HybridSequential('MLP') with net.name_scope(): net.add( nn.Flatten(), @@ -113,6 +115,9 @@ with net.name_scope(): ) ``` +**Note**: using the `name_scope()` context manager is optional. +It is, however, good practice since it uses a common prefix for the names of all layers generated in that scope, which can be very helpful during debugging. + #### Initializing parameters and optimizer Before the network can be used, its parameters (weights and biases) need to be set to initial values that are sufficiently random while keeping the magnitude of gradients limited. @@ -172,7 +177,8 @@ The following steps are taken in each `epoch`: - Get a minibatch of `inputs` and `labels` from the `train_loader`. - Feed the `inputs` to the network, producing `outputs`. - Compute the minibatch `loss` value by comparing `outputs` to `labels`. -- Backpropagate the gradients to update the network parameters by calling `loss.backward()`. +- Use backpropagation to compute the gradients of the loss with respect to each of the network parameters by calling `loss.backward()`. +- Update the parameters of the network according to the optimizer rule with `trainer.step(batch_size=inputs.shape[0])`. - Print the current accuracy over the training data, i.e., the fraction of correctly classified training examples. ```python @@ -185,8 +191,9 @@ for epoch in range(num_epochs): labels = labels.as_in_context(ctx) # The forward pass and the loss computation need to be wrapped - # in a `record()` scope to indicate that the results will - # be needed in the backward pass (gradient computation). + # in a `record()` scope to make sure the computational graph is + # recorded in order to automatically compute the gradients + # during the backward pass. 
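+        # (Calling `loss.backward()` afterwards walks this recorded graph in
+        # reverse and fills the `grad` buffer of each parameter, which
+        # `trainer.step()` then uses to update the parameter values.)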
with autograd.record(): outputs = net(inputs) loss = loss_function(outputs, labels) @@ -284,19 +291,19 @@ for ax, (img, pred, lbl) in zip(axs, wrong_val_sample): ax.xaxis.set_visible(False) ax.yaxis.set_visible(False) ``` -![png](./wrong_train.png) -![png](./wrong_val.png) +![png](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/doc/tutorials/gluon/mnist_wrong_preds_train.png) +![png](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/doc/tutorials/gluon/mnist_wrong_preds_val.png) In this case, it is rather obvious that our MLP network is either too simple or has not been trained long enough to perform really great on this dataset, as can be seen from the fact that some of the mislabeled examples are rather "easy" and should not be a challenge for our neural net. As it turns out, moving to the CNN architecture presented in the following section will give a big performance boost. -### Convolutional Neural Network +### Convolutional Neural Network (CNN) A fundamental issue with the MLP network is that it requires the inputs to be flattened (in the non-batch axes) before they can be processed by the dense layers. This means in particular that the spatial structure of an image is largely discarded, and that the values describing it are just treated as a long vector. The network then has to figure out the neighborhood relations of pixels from scratch by adjusting its weights accordingly, which seems very wasteful. -A convolutional neural network (CNN) aims to address this problem by using a more structured weight representation. +A CNN aims to address this problem by using a more structured weight representation. Instead of connecting all inputs to all outputs, the characteristic [convolution layer](https://mxnet.incubator.apache.org/api/python/gluon/nn.html#mxnet.gluon.nn.Conv2D) only considers a small neighborhood of a pixel to compute the value of the corresponding output pixel. In particular, the spatial structure of the image is preserved, i.e., one can speak of input and output pixels in the first place. Only the size of the image may change through convolutions. @@ -309,13 +316,12 @@ Typical filter sizes range from *3 x 3* to *13 x 13*, which implies that a convo ```python conv_layer = nn.Conv2D(kernel_size=(3, 3), channels=32, in_channels=16, activation='relu') print(conv_layer.params) -# Output: -# conv0_ ( -# Parameter conv0_weight (shape=(32, 16, 3, 3), dtype=) -# Parameter conv0_bias (shape=(32,), dtype=) -# ) ``` +`Parameter conv0_weight (shape=(32, 16, 3, 3), dtype=)` + +`Parameter conv0_bias (shape=(32,), dtype=)` + Filters can be thought of as little feature detectors: in early layers, they learn to detect small local structures like edges, whereas later layers become sensitive to more and more global structures. Since images often contain a rich set of such features, it is customary to have each convolution layer employ and learn many different filters in parallel, so as to detect many different image features on their respective scales. This stacking of filters, which directly translates to a stacking of output images, is referred to as output *channels* of the convolution layer. @@ -332,7 +338,7 @@ The LeNet architecture is a popular network known to work well on digit classifi We will use a version that differs slightly from the original in the usage of `tanh` activations instead of `sigmoid`. 
@@ -332,7 +338,7 @@ The LeNet architecture is a popular network known to work well on digit classifi
 We will use a version that differs slightly from the original in the usage of `tanh` activations instead of `sigmoid`.
 
 ```python
-lenet = nn.Sequential('LeNet')
+lenet = nn.HybridSequential('LeNet')
 lenet.add(
     nn.Conv2D(channels=20, kernel_size=(5, 5), activation='tanh'),
     nn.MaxPool2D(pool_size=(2, 2), strides=(2, 2)),
@@ -350,35 +356,38 @@ It requires the network parameters to be initialized, and an input array to infe
 
 ```python
 lenet.initialize(mx.init.Xavier(), ctx=ctx)
 lenet.summary(nd.zeros((1, 1, 28, 28), ctx=ctx))
-# Output:
-#
-# --------------------------------------------------------------------------------
-#         Layer (type)                                Output Shape         Param #
-# ================================================================================
-#                Input                              (1, 1, 28, 28)               0
-#         Activation-1                                                           0
-#         Activation-2                             (1, 20, 24, 24)               0
-#             Conv2D-3                             (1, 20, 24, 24)             520
-#          MaxPool2D-4                             (1, 20, 12, 12)               0
-#         Activation-5                                                           0
-#         Activation-6                               (1, 50, 8, 8)               0
-#             Conv2D-7                               (1, 50, 8, 8)           25050
-#          MaxPool2D-8                               (1, 50, 4, 4)               0
-#            Flatten-9                                    (1, 800)               0
-#        Activation-10                                                           0
-#        Activation-11                                    (1, 500)               0
-#            Dense-12                                     (1, 500)          400500
-#        Activation-13                                                           0
-#        Activation-14                                     (1, 10)               0
-#            Dense-15                                      (1, 10)            5010
-# ================================================================================
-# Parameters in forward computation graph, duplicate included
-#    Total params: 431080
-#    Trainable params: 431080
-#    Non-trainable params: 0
-# Shared params in forward computation graph: 0
-# Unique parameters in model: 431080
-# --------------------------------------------------------------------------------
+```
+
+```
+Output:
+
+--------------------------------------------------------------------------------
+        Layer (type)                                Output Shape         Param #
+================================================================================
+               Input                              (1, 1, 28, 28)               0
+        Activation-1                                                           0
+        Activation-2                             (1, 20, 24, 24)               0
+            Conv2D-3                             (1, 20, 24, 24)             520
+         MaxPool2D-4                             (1, 20, 12, 12)               0
+        Activation-5                                                           0
+        Activation-6                               (1, 50, 8, 8)               0
+            Conv2D-7                               (1, 50, 8, 8)           25050
+         MaxPool2D-8                               (1, 50, 4, 4)               0
+           Flatten-9                                    (1, 800)               0
+       Activation-10                                                           0
+       Activation-11                                    (1, 500)               0
+           Dense-12                                     (1, 500)          400500
+       Activation-13                                                           0
+       Activation-14                                     (1, 10)               0
+           Dense-15                                      (1, 10)            5010
+================================================================================
+Parameters in forward computation graph, duplicate included
+   Total params: 431080
+   Trainable params: 431080
+   Non-trainable params: 0
+Shared params in forward computation graph: 0
+Unique parameters in model: 431080
+--------------------------------------------------------------------------------
 ```
 
 ![png](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/image/conv_mnist.png)

diff --git a/docs/tutorials/gluon/wrong_train.png b/docs/tutorials/gluon/wrong_train.png
deleted file mode 100644
index 2ddf291e944490ff1e375c615057ab87e96c6e43..0000000000000000000000000000000000000000
GIT binary patch
zlYMHPnh)#JN#fC6NT-n(f-(o=7R#2U=Rxt!rGx0NN9`c1D zcSOPgmifCah;m@dlf6stkK*!hvpak$)FE(h9l?!$@1JRIFe>H$7vmd?%1n2uEBUI z>z%X<;5TL&CuV@=#mqIVw^ECw3|HQNOsPiTWl@KM+NM9BL4Y3pQ@{ULY5LZsae!+V z@5V_+`-|WL-jkcYK1lr!)&NAu@flzUH408Qq-Sbz{gr>W0b}tHL2L~>7*7^lO<^5P z!rd)0Ld1!b;fUE7YEg7Gexf=&2qM-)n)socW`i%5A%1VMigSG?U-0OspAM6{qGO0G@x^39diP)s>+jc-0qV$ z$7fcXDTZ1SJA;fbwl@59fC|rc9~3-sd?tdp`k>S*|4cU|j_HM8uQKEH3eaS2sy7Y~^fsQ2WP(6T0RPV}EF{U9~@4ngaoOFac24u~LbQP>N$dJ7RL)3>2%b zWpMwm-(Rm+$peZ5Xk67KfR$-dMY^HBJsUF0G3nuJlR8PWaXQd}A~(AJ{rk%`*f>DJ zsQsGCicT^boi#K}NuD@@!^%IP(FS#!)>BSu9)spfSqsPhES&a>k@cBW(9fxf zq_darT?^EW!-$sXEmzjZM*uA_UtXV>W0P!Y%my_78Q1WBjZq~Pm`XviFB{njX2Af(k2N|}=a{kvPv$znK9B`I@JRW zJ~|7?6XBUn?`zp!OL|gM_J)C4AVkJGU|Wp!z523{&2WHO`RU2>MQ6J=7W$9PMCA6i zSOuQ;ddF6jT~qX2`C4mSum&uyRtTMzjE;(*z7PDoNyotHD zN1FNuM-T_q&Ddz}2SE^k{>L%zt-4Na?>yp>0YR+7WD@8s<8wQMFV+ak9Jir9z;6F@ zBm5UREWJ+S0LCRZKYb!xtE{T*hWNg(HsC=LcKtS=K6B5M&Nl+hgev@0PAUjS=GM4 z!*swr=mG%fJQL^E1mMHuCU8D+r3ueWD?roOo(t|hZ_EZxDnN(k2iRXvci|8CkEB5N z|9m~77Xf$ zQ=?1#77viV>OWF~SzLzjOp9CSq+z`f-k%9;7&J9zkGR12g8)ZjVeL25oovhoL=te9 z-J8z^A(W-f(gBo)Jvn1Piaqd-2Aqaqg^S&r1mMj$hIj>cD9Cbq{_ZkR=Cz;1{0`Co zFiwb%ASgHc*9LR~zTY_J&jW&og4l((Z~o`HfaQOB+<(;Sd|k;TAGkXi%$qX;AzFKb1Tceb%?DtGpL`F-ypd_48_DvexX2^`2s6cPu_ zJm=xedICWIhJYAltxhznwslk`vre@DEmx71IWq!86+laWs|9o%|`_10j+5o_cdmJHW8jL#d7wR?>VvarPkg^2i7A1S9gX;T^xLq&@wX4 zc1`lPDNKI`odgkZum9_c!zf49-4T^{N| z`}4B%Im0z^r%~45BOfNKu4h!4^0;}UWJtp-)!T~q# z;Br_f)Yv7zii{;(({$9bl)G29UGNZ(7IF%MKK*LU1_;tl#p5CJS2ZZ$0^A$tM3RD> zX}G!#uy8W1z^^t&N_?jD{7HH1$~)FeE_+({^pCHJ!#V)L^!Y3OJ$k6r%5RKlclKXr zBRq^5`iM3^*8J3v%ic(c=-e8qxNH9t{qhm|ce8^6#}CiIx|pSalOgN@q6Cn=nq?li z^bd8soa;ZD5Rgjo5kOj}+%gsMRk=Ze)|5#XN$JlvG@W^clDn1r`Srwgg7^;tTm772 z#MzZHTx}SN*A7G#pAl|06QAzW29lA478s?Af3#5D=Z?tG`iT@8FM-AFvnK&CZl$$w z1@GFp@+R91kdyL80F~B}-F-^3<0F7vr`HPivDO{luVKsvqLhltfLHT9^PnBV#`llH zfBfV_@i#0w?7s`N*o zsVHn#p@&oe+AHVCjB$bhKfNe=ZEUVVYx2=YfW99(Ac1vQkP3Y#CGIE8M6d-J;1TDL zA7P7@L`JIw$kMGXU=x z;qe;>(3J_=Q&h{zB5d}=%IGlcF~A)vNwbt~3EhyxGs?}ASGD#5xgtpTlrACO{R6^f za~v-u0cv$Z%KiS+(?=IJUYMK!sAE6eaz^VKK^th(gk6Auj`94SY7!6vQYJS6tZ`|5 zuA#E-&G&B#Ew)ippU1^rfWSRg9X%O@v(whq!qPCLD4wOR1^ zS&?p_CH>tOehcMg%s`BYn5`Y~7|`Fi6}SgLVqoj@SC;^XBmwvup7%UN2kNZ=ARd== z+$2x7IA2H&t#PPv2Wlxzgx5KNZP_?YGl{WCYGFbP(xY^={RhCSw$34?;sDKhv}=c_ zNknP@h&1j1ZUKLgYx+8t3m9(1K{nNhdn89=d+g9%v41^5-6@TrD{5k>FX;$fYKrhL zkW!@_1h2Jswn3iMu0%9jK23UYyKJA7DnzxlkVJ~Q% z#@i|sz(;nS=gV=7n){&}5+WH6MUn*l;+n2KAky?E6{O6S3%$)}z^s603vNz5#q_DR zP=XcDG<>MD4=nD7J=Vp+9=D-()cyxM{B7$uhay1gNV&&u4Oe8HY_MDJ*svBUOC-I- z09m+NlAy4pD_#G9muv* z1BSe#`S?Do18))XwAlafI`4~5viTJkol(bM`XT)zAA7@00lM{#Y;xZ2XHigL%jN_cB8USG~D7R>kAqb>4pPP3s%g I*YCyuKbNP?0RR91 diff --git a/docs/tutorials/gluon/wrong_val.png b/docs/tutorials/gluon/wrong_val.png deleted file mode 100644 index 8feed9034921df1d2bdcd8908543b35ad6568521..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 15438 zcmdtJcT|&U+b_zz4l<)6j1FRhpoU(RCN*M%fPjjE)Tm(SAcjsLV+BP(LJgs)1P}$0 z-pM$E5E6(4q$LoAKnyJs0wIvHADr*oXRq~s^X_y0J**`@33=*$UB7nUd3@2qT6)L9 z9TE}}(ombTmn9^&Z2;SSzibD7{}Q%S9r*8O#A)c2Uw|LvFFr}Y>tAo%xFRGZcJA5w z_Y-9zK>)mI5`OMlxKk)RJnF`sTN1%H!fywKh6nlIREoTH2jL$YqNJ~*r*m9O$uB(o zwyCb}|MLNz&^x}m(!nVr2?-?$=vk{P(fNxLF*J{9p?H~>sC4hBbo#~1m-l==`m$QN z0sSgK>#E(a0n)+6$@@}D_HfIxN2BgteA0OtcGfUCzS7h7nWWX8J-_{i_P|5lA{_f8 zZo!u@n}wT&as6$DY2D^aD;UJA=}7*3KB6eFzxgdOAMr2X>Lnz`-g(sP|JahyNPJ@X zW9z`_bD*t_KX$!%DYLb4>e0mksjUsAGq1jF+uC?``dOYUzjPflr%UCvvAuMq7E0{oYtsBm6X!nsybH_3@-ip8bY$|wFn%Ciw z%+AsU{mS}+@i~*j5&;Tw@k$>^K{5Jf<0O^bu`>8t3MT$0yefAr0=`xoz&xHzS~vxj 
zx7)C#`~pQ~al^0bKg>h3rGC!*NQ$&=pkqGw?{pI-R=ItsmVRDe_a<_?MBOfZcP|Pa zEd9*Dnz9W&nKN{Wavr@qd*}=${&TPQBVpawD#X%<*Rwx=zKy|AmF)vxa3RlzoG7$? zw)m1ox5ZsUFB;m&`A1oyoS2^_uKWUAV(V^n6$7v3Z_V_$eb*PR0A4LW|8ee=3-l!# zR+NvZ1@{HK=i<{1%1bDr{{78SuL}}m2eWwxn6zC`{nrID57(#^&IWXpFF%G-+I0)fZHXnec^E8Sc;z-bwJ`F;a+LZCYPY2gbr59-nc` z`FJfJJ9b6i-49-c9TUY+dh35RthLB6n8l9y!XtAq@m2ZK*WvMKa(pH-hXs$U$3$rB z7i5tZ7&yX*9Qax@#sUt=n}KdfceDNUsT$-O)^sq_r?t;+jV(p0@GT^V8`ZA#EuaVu z*o+Hl2DQ>F!D;6HT7&2D>M`oi)@-2C%zFKoi;Yf&Gj?7s&71ax@j;ZO#Y1U85lX%-MQzAT$ZC;4dZyglGG<>DC`xg2f%%+bCmc{35`u63H z1vzy;|KrnSf>15fhhNel9DqXE*a%6yEW`u$s}?T>^S>Se%a)-m>DgWxD5 zea&0$%KpK*$Zi557GggfpM{AxBXguMV@*DX5^ljm^Ts}fTUI{sJ|v;>*lgoeWS@g5 z#N(rHtgWd#eR^@vZ9bP*zX=9mWF(mmkIO_%E3x>BmD1awx+J4 z^jYi6Y&Kdj@rl%8!Yl>Hj9<9y3q7`itFIIkze}@gvAV1Zg{c;8mg8ELF}PN+eO6r- z?mc>Y-q@P-d{Vw@IUcvEoRTFzGVKSQaEiH7le!H&p@N?ftE0+7%%F}0h$rQDab{fv zni&d~&I~WbO~TjonI3{Ow$^;h-#VTdxKZ{%L8^TBGZYj!Se+h&fC7bZ7%t1nbj!-c(JPT*9EIdmYhBx?g9-RK*bjq>E zSQv9Q0j0-}kAIHjx4BQvs^M^LzlS+;czV#izlYSS+k0*mpJjh@6=`8Nw7aK>zT`-D z)Od)0Mt1=@^le}TmV&2(>BTohP?S6oB%ip?J@oi2$~@kd(m2!j!H(0tY5~nB7tLVf zE!4%Rm+iR4nKVKHX`#Enxr#z-R!d9RPvuW{CA(~gapZ9@B1~!WC@g03c{;g5r`)NKH#2`lO#NRo-Rvr_d5##-`Znt9zZ_sb@8AE7Y%(GTuN7=PEhH)lT?c9vr z66B%D(-HDxDrI1nxCj?n&Kt?b`$eD|k}dToi@fY~-ek>~Eyd}qPQE5Y?Z;M3_n1?s zJ`Kg0l{8FL#|&4$7D%q#gvJ-gpd2iX8~$u4Q1KTIZ7woUyY8S9GGkGOd{E*(+i`Q- z`OtDfgl1UlQ^#zS9VM@vMb{9fK3s5@>Fa?({)?gv^@ zgDpp)KU(`2VA}9bbL|eC#bSJRsBJCjcs4WcXCiE?0A5vpHWE_K+p(xG&CEG6u(_?Cd(G55uki(#m{E3v zz>zYDG|zlSf3#>|Pca4Vi6DlAp&j~aVTSo{TpZf*4ak|eRGgfxS}~nLRUx!peOY&C ztz>}3D`qQ%Qtx#e>3X$AU*&WaZ=O02AcQ7Mmfw0Nv4@_OdlO#U7xWafn8nmbcov$$ z`JxW%GWmuq4Vtv!Gs6mn+X8O2}r>mRKSz@9l$U|f zi?_Y^h}7J=528aN`n#goiA0#6ahD#G=3BmsQWkG}${?z))#D(gL_0LvIz%R&+lE+b z{1hjI0mpX zj5cTa>x=!(KsCRGibQH&*C_YarJ1ym{^ppO+17u>_%BBohT#cB6JDl3#w8Bp+U>V? 
zgFwf=Bx?}(mAw9N&La~#mZ7>1|r~tuVs38l|W_;o0*=7+~f%jcae?0j*0BEp$gxH|=I48}Gt%S-!N%5@@@QiA)z(6u|#z{eO{N$YA8TNzD#pffR zYuX><FlTADHYI7KPSTpD9 zNyQBOpZ$%CWng(;b>$4P{UUZ$4vl9Sjkhy2<%metEM8Oa8Ib%##l(=ej~2h@y03-7 z2|1fb2zQLFS8TQEzjC@43GiV%Q9o-h2co0xd@D0AXV}BVX$^Zb1RZ)b;yF$eOy}K2 zO7ijj%`TLkst|*GFumgX8GdW8|9p1S47T zCBu0Extfh>2k;c$@CczJ+&1OGq%q7{S+z0~s&w-&cl76V#W>Pl^kkCACGuHlq`n>H zv1E(HwO170PKj4)8XKoRD$5JKGhEJ`3xn{DO7Al?vv}Z@wRAi}oP|6@ivsuMRDD|j zqXu%Sw$+?IW27D6^h!{MAYP?h38KN0hf)Ps(Z~rmrGm5_ZS;ILn(cu6Fu|EwsVKh4my5+8Cn9zD zUws<{X-3tjsn2iVM@?ho(385(Ib8p{y@(g9jKO!F?g774KgkZ|Os8dPz?d}Vtl5m< z4HB;)9Tnn>8I$8g!B&FH?qp`>>`;&Jv>yw7((Eb2RV&Wt&E0o6}>>s;7srj0|~<&8i^Mn?O>KL|b-Ac~BSfuva# zPS+E1{F-N@ojYpFL8?*TjdH5i?d;eA!fPGJ{^Z70Bh!XTS4PqFk{j;w8u-&}uf zWxLRzijD#|UpH^+m!nJ?+s=dM0afS=dT5?CdYXEo=+Taz7POc4G@>LRE8fum=4~Ge z^lK@HnYI^PphFK4)Xn1P=M8V+)q6~M9T=gt=o&lSTsu4$d5hoRw_ME77$d!9tBhL*_3ywx7OtJ-LF2}^zpmU0m|Lz6A~uQDrA6{-dfqwJ^T2277m82-RKmf~wI_W!q5(Wzy#Ro9$+(F?Jo$i|(Jk?~g4h zl<@0K7|cxvv??iRClr#&eU1o$*Lm<5?1>nP-;zufD_(`(Fs@FHvnMz@^%jrvU$-Kq zs@dn#7G3fYA?b)ENhVF40n$EiyM|!WO4>7G|U9`p$<@<}4R zAyx3*4_)P9lzWsFmwnLJvKbNeNHEblTw{0QcaQO^87eak(JvQRq{~k&=|~Sx$7t)f zJ?DoF*tPrNtvV<|9=GcqCW1m~%`wi*b@o2EL@7E-_nE@@O{Q`_Tx-t{-|mtc#{lsy z)M5mti)Z-_cyXliE|Y7W?VPCQ1b;Zzdgak;mi=J|yNJtbE8Y=B*ayb5;+`6GboVNPM+;x zMyiW%W1P@mQi0kdA&otCZXN*NZraHg9a^=i##iuR1!b~LV~ffcBoNaKoWgua;< zMGfkhHwKxZQbJT=kM=S&W$bp8p+~TsndlsWrO8E+b&(V!F;%O(N;%%K!?p7Tp?bMm z&_uPEpXHE2Xhu?5DYmZVh>&cIf)=VRMZZ=`x&0`8uUKYA zS1>5oWv#5>2j>x^_BY#5&N69dts%E*p+*uwGY}?S7v&&|rwHcp9Oh7w z3ZLwP0jc6q`B<(#2Lm=?K9cXvL@Yt|%^WF3jI`EP21-CXLxFP&yc>o@k?(MONkR7n zQNjzNlXSZdMEj!wNk)F_dr|{h_d^tE;Rat=Su9wlIf}afn!e03o*CW#2dCLc?;{9M zuNq=B{!fUNX!oR;!B&jdhAnSE_-0Vz44?KD?S;m_%F9BEq%`#Q)jUb#S)3b)9xIQT zmP;|LD)HU}(m{P)zk*IP;iNqY$yY5%*U&SHL$lL47F^v-@ry5pP#3%LlBdCAUX*fz zuY8YjT(k$pswAOt&xkh%a_S`i>yE*dmH<}Ma2$ib;Gl6LGCH8SGUG%6W~lCOgLDd8n9l<6ws`n2!N5N<4hVZ;i@b04L1xbgg6suW>Od`%?v%=_icaCT^w zuv$<|ZVs;em-kE6m0|l{L;zZ(#V<2y(79cqtM3r|B3K>vqWz*s>H(%aI4lRZ*gHxt zQ!~}g#OCZ7u8z1|h?vpDG;{psw)}`~&-8A$*b6W>pJw}+xO?KWBINo#Bvsh9d6$!B z2Ma1<6=r_T2MfyppVi-5zz{dm@)kbqWPo7e3_)Ms(1So-rDQ|ozW$V$=8%%U-6eHa7oJ*C;kdmmD4?8&9S$9WH)zd zuDUN-Ue_xNi9tNv(E8UO!?)mouD_u0HUE(71nv2jkf&!=<>w4NF2Dd;M=?T&-E+Ux%phdFU+z_>O9K1-_b+yn80C)WQJOE##zq}{IgyET8r zxEJ4XNwk{6k=i^JUo4nEGz0T^bCuf{^aP`=SF1}}@YZh~AUOv=jxb@a-R_^bar~iP~nkQFZ;WXYi_*2;NYRu?U41DOaJeW2KeVOAXH_i4a#%H)O_< zKXQXVvdM?t1ij+L|G_B}%(n>|kJnn07M%34{_vp&kZoY# z^=I1!bN^@EhGV1hYe;~LK|HStt+bviLj~lGJ(1{RvXnX zRs`;ay)6?m1FM?BmkJC|8AA`*pUkTBH5G!}i!Uz^z}HGKefqV=q>3EO#8Llv zC{GoyCscQhJft5-AkM6I59HnY0T`+D#^Hr~UXJ zhHn*#bQ9xl6XPxuCK!Y3Lbn*-+Wg!StA$Dxm#Cq7VG+)veDRruZL94v7e$%)xherD zynlj~zezLUmEefrU^-6vnhw+BO&i!gojdKbVFqX=1Fgs1c6lnSP4qa_&5B}yew$tQ z1Xr(Yp~)Kc;ZK9@Z7D1@|9r4&xu-^ZF`6Y*j#m=TrxQTe4t2)ZPDLp9WpnrPK~|A= zQ}t-RBedlr#TLCm5FgdoSmnEb6+FId%AmQgamnC5+frU&fg2(AjCR)p|Jt6+xW0>c&_V*IATdQ>!3EIzQ+`Qj8(8_rSUFK z*cVha6sOrEb{o1%@$GNs55(e~ZjSE+#~<@=0#qyRSGbWwf>eBMr$KM2 zRQ4~$tcLf;Uk*8F__~l-G+ne+^Nj+;+`2%wTE^z}JkSO5GqCGko|$+|L{4afUZ)A~ znn_Xc>*2TG}O#J+&A_o;@0##su+i<)np)VEg`Va`#yx1UgOU z*zXHbUq$qNR*5qT@9%qN;7pN(uW9SM_mhlH)GB106REQr;SGhDN_7_FD8W;el?HFf zikZkr(T0NT#`WqKwxVlSF|Iao4c?G}sbuD3HCQoFlwqx0Uyu|ut~`A24E1^bP52^^ zb3n}qkbUiom318&&!9TzA7RaMTSyBL`kS}zB9R$5 z;a(^rgRR$NVqE?p_)^-5t0MI9Gp-E324&rG=Gjj`f_#uF8=>JeLtcg%0l<0dwV;cS zS09y{17MS0<1Df!h+o})#P<^60qnFL-Q5R=cK@Is38ii09EHEB=%UJhA}2_i&&h|p zHw7|Rq<~|pq29?1{cnt}3U$>K8@U2_z6@W(U@EUnaVAJ4jB9Ilzx&-)$K$n<kyVlKfUA{RctnWXYnn?IcN^3WJmEQ0@fqi) zDc%lmxQ9ztws2Z(8)`W?A@#^+ZYqB>8wX64oT$}FL2i#3a^quNt)V>VMyuf37!%q; zCksK_c&6<(XgXU!j@@gtE1^)2j%SF-&%fM_5X`zoHzaG`MFgnbS 
z^*dXcBUXa$EkHPFWO}uif98nqrD0z8p-Sj@JG^`P*u>0TCZx;SYo^wK$r z;QE)SK+aq!yufLtKc_BB02xk2i@;(c6z?NCNT||q%z9Vo2o_$UG=de5u#W1_XN6kn zxBf+X>j!Rk?}N0M43ms>ZwPy;oQaSEG1k^&AMtfF#~6SGp)JK#xk5hWO>?}G>nseu zmL+gfm{zucomPBT;iieQ^IDF>W<8VAdNbUM|7qlptV_u=kkb?;efReyqx8B;94cq) zkNb#_eiZffwv)_pk{x9?x+8r^{jn-^!}COYK28#CL|$nQ;Y3{;FPSuRqP)6EJnK(8 z=5KSNJp;!h_VCuw&e^~mmO9!JP5=73!#mIV8=^8C@0?J!@mCrY`=#(6y|3yS7$Q@1mWJsxcq= zW7Shch30AbU#dyPJIA>XPrU)2L)Lk0#kRWV^h;$D5{HVm)&kUax{w05tViNUM01bZZ+{|Q`7ncjf$QpT#%8R?>$iG#OkJCs zo{Xq^v{h|Ju2Oz>QCwbj-8D)ny1~ZFLfJya{~03WG42r9(+}?17ZkIipi=>+FZb=? zZ+rXR4!wZ^ZCsDwcb6ZmP{VKQ%dG1mtvwsUd0o!nHW~wjQPGYm0E^#z+G`K3O8xA^ zFIzY!))Y0tOP#6$>IIG&bzvH09<;@|qffrqDP!{w%&N`O?uz`n0rRsReP14w;#|4| zY6QvzQp1H_kvnJ6h91>#K^x-EU9r!G0HEQN0hE(A6b_fdj@<#$Biy#Spj*FpeGcFo z`?W^=5zwe$Qy&~{Ly?S`6hB19l|^!bpP#qcI7RW`7w&U$$TqX<_%v%Xt`fuMfD5%) zekJ3+Hl|jhko53YbVL0U;GGs*%2i2@FdkafBaSt2rxc@OUE<<~$G3{Jp7a6z z=xUtV@Z6hW;MIc7zAv7TJ4&nr#q#rHVg1_c&m6~D@i79~0~%%H);dnD z9rmhs6HOg__m&93;FAS0s1Wo>pmM~kHKWQB7wsHW%>``bwrgmsA6)b7r(o8=-&FJI z0wY~JBh|L@kk{NJwYe=nW!cUjzZ#hpFb zet*50Sr~Z7Es7RqEeneNa@XErr^MXYRTs}JY@rDPuE#EnTTrzW#24ttiHI0c# zkzj=^Gg}ZlS8jE zFS8`F>3oG!$CrCM-E_PqG;aR+$4^$sUiGq{!^njeh{0m(wB557g6|)!@|OlBUq#UV z(a=6?2`olUM2tv=r72oG(GidN7D+C5$-_MK+UaeT=rdHF-T(Q{=#50luqv}W$6wsM~OvNf>G}@$>j>y z+Fqj8r$@J~|8YUp+{+XZNc26C#?@(ExIFBO;3viZt|F3S0cU58KVCGjTX*iT9n#A%~< zh}!qpUnDB_+80GnJ(hD1ZHMxTk3Q^13Q#AF5VIsD%gu$_`zI%I92!F=?!_#NxzVCo zY^YZ_|K^nF;S_l?iXLxun0M}SsL|;4`$`UtC1N5>1YAbr%xJau{q;|0WTdCKhO@`J z5AP+l&b@wfFM9Od4J-5L?_X|?olkkZ$>p+AoLZqjRC!>qE3W1FmC&^FgCm!6`Av}zkG$V?>uHT5DnQ3j;gc_wjpaNmM~>xSSr z;{&Y=HAFWvv*7WE@?QHpi;x_sEE^a#tHY3+F!ivN%JsbG!--DBC)`ijz-fP-?HZZG z3l~H0pA4CdASmZBN~dRIJzFD#oh_-ROaC1v{yS3!bB{?WgkO}UM^C3*?fup5uOB5j z(HbQ)^>I9)e)f~OTF~le7nvV07Dh{$FUj%hwTJL%i1%#(1-1Oue8wFCfIV2Eic>tkY{ks-pwyG9D0{h5;bMlg5MV9gRPAZZ&WKip8LQECk$dBm~341N_JOv^lJgi{b*8}$!zXR;uh9g3Alrf%+f9PxIVetY~fQy+hPq-r`dn}n~TFn zi}sTB3q$I+ZXq65rHcE=JE!%zHHxhiQfb6T*8yBL13Z4qrJP%&+~bvK)dy1@CwH4L zp3%5B5*7B5Hb?LxxlY!~JFtDBdGnW86^H!MA+2HXKbM7#3iTVgz|1&_4D>H8+&p0l z;LYDUHLKe0$ii;F2YQWJ7`HvFC4Ttn0sUUGmK&`RC_Wk&pasuiUp#P+N)Bv0w87s9 zsXRZ~9w!UB{pU~Tavam}8qn#FDMWkY;q!P}!l8@t06{p)7Hx@+g?yrY_u}#C{lVmR z-lY352fh!u?DquKqQj)==uj->*0sLK??9=<%CZ6F0hH2C5x_kGS>~T#`wy004W3|$ z>W6{RxUMMPHvXt7^Z|F3w@t=emJ|n|)BWh>sV;jj_8sD_&BCY#yuuG98S&`fLUV%8 zA>pWL8!I| zhl`I_wQ+0*nl2Z{%wm*8(;)Hw&K!?BP}%NHyUKs(&GKYx-eAHemA-x1$jR(AuTz+@s z!<|9TmhfL^MEM5PaJl1Te~#gbii|CfVQ?sHP+M(t9=HH zM4Kz?2%amRql-nn_ify1PDwO?WQf7#h_Q(%xRev+Hpk88n;_gw?X;hm{o^DJ1wQ_*ED?gn{dVj;qm&-UZolqDQw;t3 z*F&I4AN;!a-pP>p12V05hj|}BBE2M_-xWKT-)U5bZ@#`m{T8)EGfA4haiv2nWXlT{ zq(I1v!z~v_|M+Peu%8nz4J9Zxti5lv1ki`t!7|~DF6!5*Chc#!C_YgsmHd;wZ-0>O zzDrgp;XEw)Bg^v8l+nl!iEq3HnR?s0wBu&~iZ;6x8?s|cyODMx$t+^-_s%UTKefrm zLx6%Or!F>7L#zbL-5{y3N}2krc9Q^w(87@upzg3kLidnn<)jgy6#<%0wDTSjhPicZ zL_m4eEZyKMbY>Asu`^8J4znO?_mmqu>h6DjN+6VA+3Ij(@&)%gze}=35yP&^Rb>EM zTD5ISudwD|4g>%Z8uwV5IUtEI<$=WO-N5HxSOD06AAsTx6Ut%>jx)~e+BE2`{rc$8 zZ0U4$)pT{gs$-#tL1j2dDe!GGeBjnJ;xx@Ne5KD=cYQ1zds_W4-ZKDM0b|G)<;1Nm z9O&%xKO^6`nh!xG{9qibwqA`lKZnsK_Xd;i%UI@O3y*uB*VENO)mb2z{nAs%Qcu3@ z8vcCkP(oc{xT}G)j(c$ZRaxHLXU(MUR0S+H0WdeorE=u7&6}}H&CGjEQS3>eqc+07 ze|djy$E?nO_ILjJfEtX{?)v>ut9j8|&(_%= z3jg{fpq`za-NM8YE*@d$LN*GbCe<~yGiQZ7y3ACDgQ z(e^O`L$PaW%TXCy0IuiyWLwNiUYnNiWfdX2#`C|bi!FPq|p$l3MBye#D_b9 ze%du9r5OF3z&{&P7{4|~3-{Q)O=`E+hO7S0sU{kBWh^>>Ro*KaaIS1f9ii$e;57hk zw3b_zf5-NRv9|Yep?y4&I&C!WcR(J71i&cE#p~BuPe5E$78K-nCCf{NnK1Va70zVu zWGBiUI@4+*Y<(UxmEbWoeWK%$U#h+FghA5arVR}y>(+fF@$jBb;2hqvTbe==vI*EE zv!v;c(W%&JU{EJAwVb7+{xLYg_0q*B_dw}7_0cG(BSjY 
zSky#BGb{Q6?HX-ZHzWfjhMd0DIjtz3bHVr8chAHC(rx?lbQ(mllSfCP-mm80{_9sO zj}VmNvf(M3uaw3yPn~Vscd4~305%5HnH7|#&`2nAeAJ+>lXEh4^QXX6MT^*SVw zeKK7FbheTMY^6&P2o;nH&7wkNi^4u009du%!29FVQlIL8rkg(^r~l&%PR%Zc{^*{I`TgDKG`Gvr)6jwcPKNqpSg z1QefaC*a1zm20j^%ju%Z7nioBJWf0Vg}Ctm{UEtAb;JNDwy`@00~G3JQVS9?qc*s#bAzd%&uQcXfIN5n(E`T8nH<AKGE;z#&T193E;B1q`SYOR zslW}u^50u$Lc}q$TRaC$P2A)ZZfh$3$FGY8!=u|rr@E3gvz=QO`x%6c`!Qj0T6OWu z7v+~P22_W~O#pU6x{+Z0+Ye_{mbmnU$_7AbU`={UMNJ!JPP9bqoT?vo(=?uFMlb

From 6fcc2018612ab5105170659d1538cdf44e13a255 Mon Sep 17 00:00:00 2001
From: Holger Kohr
Date: Wed, 5 Dec 2018 20:29:47 +0100
Subject: [PATCH 09/10] Change activations to sigmoid in MNIST tutorial

---
 docs/tutorials/gluon/mnist.md | 50 +++++++++++++++++------------------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/docs/tutorials/gluon/mnist.md b/docs/tutorials/gluon/mnist.md
index 9845fc8d95ff..7634c15ca676 100644
--- a/docs/tutorials/gluon/mnist.md
+++ b/docs/tutorials/gluon/mnist.md
@@ -70,7 +70,7 @@ val_loader = mx.gluon.data.DataLoader(val_data, shuffle=False, batch_size=batch_
 
 We will cover two approaches for performing the hand-written digit recognition task.
 In our first attempt, we will make use of a traditional neural network architecture called [Multilayer Perceptron (MLP)](https://en.wikipedia.org/wiki/Multilayer_perceptron).
-Although this architecture lets us achieve about 95.5 % accuracy on the validation set, we will recognize and discuss some of its drawbacks and use them as a motivation for using a different network.
+Although this architecture lets us achieve over 90 % accuracy on the validation set, we will recognize and discuss some of its drawbacks and use them as a motivation for using a different network.
 In the subsequent second attempt, we introduce the more advanced and very widely used [Convolutional Neural Network (CNN)](https://en.wikipedia.org/wiki/Convolutional_neural_network) architecture that has proven to work very well for image classification tasks.
 
 As a first step, we run some convenience imports of frequently used modules.
@@ -105,13 +105,13 @@ Unless a network requires non-static runtime elements like loops, conditionals o
 For details on the differences, see the documentation on [`Block`](https://mxnet.incubator.apache.org/api/python/gluon/gluon.html#mxnet.gluon.Block) and [`HybridBlock`](https://mxnet.incubator.apache.org/api/python/gluon/gluon.html#mxnet.gluon.HybridBlock).
 
 ```python
-net = nn.HybridSequential('MLP')
+net = nn.HybridSequential(prefix='MLP_')
 with net.name_scope():
     net.add(
         nn.Flatten(),
         nn.Dense(128, activation='relu'),
         nn.Dense(64, activation='relu'),
-        nn.Dense(10, activation='relu')
+        nn.Dense(10, activation='sigmoid')
     )
 ```
@@ -148,7 +148,7 @@ Internally, it makes use of the [`SGD`](https://mxnet.incubator.apache.org/api/p
 trainer = gluon.Trainer(
     params=net.collect_params(),
     optimizer='sgd',
-    optimizer_params={'learning_rate': 0.02},
+    optimizer_params={'learning_rate': 0.04},
 )
 ```
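As a side note on the `prefix` change above, here is a tiny standalone sketch (one dense layer is enough for illustration) of how `prefix` and `name_scope()` determine the parameter names reported by `collect_params()`:

```python
from mxnet.gluon import nn

net = nn.HybridSequential(prefix='MLP_')
with net.name_scope():
    net.add(nn.Dense(128, activation='relu'))

# Children created inside the scope inherit the 'MLP_' prefix
print(sorted(net.collect_params().keys()))
# ['MLP_dense0_bias', 'MLP_dense0_weight']
```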
@@ -232,10 +232,10 @@ for inputs, labels in val_loader:
     labels = labels.as_in_context(ctx)
     metric.update(labels, net(inputs))
 print('Validation: {} = {}'.format(*metric.get()))
-assert metric.get()[1] > 0.94
+assert metric.get()[1] > 0.92
 ```
 
-If everything went well, we should see an accuracy value that is around 0.954, which means that we are able to accurately predict the digit in 95.5 % of test images.
+If everything went well, we should see an accuracy value that is around 0.925, which means that we are able to accurately predict the digit in 92.5 % of test images.
 This is a pretty good result, but as we will see in the next part of this tutorial, we can do a lot better than that.
 
 That said, a single number only conveys very limited information on the performance of our neural network.
@@ -253,10 +253,9 @@ def get_mislabeled(loader):
             # Predicted label is the index where the output is maximal
             preds = nd.argmax(outputs, axis=1)
             for i, p, l in zip(inputs, preds, labels):
+                p, l = int(p.asscalar()), int(l.asscalar())
                 if p != l:
-                    mislabeled.append(
-                        (i.asnumpy(), int(p.asscalar()), int(l.asscalar()))
-                    )
+                    mislabeled.append((i.asnumpy(), p, l))
     return mislabeled
 ```
@@ -338,16 +337,17 @@ The LeNet architecture is a popular network known to work well on digit classifi
 We will use a version that differs slightly from the original in the usage of `tanh` activations instead of `sigmoid`.
 
 ```python
-lenet = nn.HybridSequential('LeNet')
-lenet.add(
-    nn.Conv2D(channels=20, kernel_size=(5, 5), activation='tanh'),
-    nn.MaxPool2D(pool_size=(2, 2), strides=(2, 2)),
-    nn.Conv2D(channels=50, kernel_size=(5, 5), activation='tanh'),
-    nn.MaxPool2D(pool_size=(2, 2), strides=(2, 2)),
-    nn.Flatten(),
-    nn.Dense(500, activation='tanh'),
-    nn.Dense(10, activation='tanh'),
-)
+lenet = nn.HybridSequential(prefix='LeNet_')
+with lenet.name_scope():
+    lenet.add(
+        nn.Conv2D(channels=20, kernel_size=(5, 5), activation='tanh'),
+        nn.MaxPool2D(pool_size=(2, 2), strides=(2, 2)),
+        nn.Conv2D(channels=50, kernel_size=(5, 5), activation='tanh'),
+        nn.MaxPool2D(pool_size=(2, 2), strides=(2, 2)),
+        nn.Flatten(),
+        nn.Dense(500, activation='tanh'),
+        nn.Dense(10, activation='sigmoid'),
+    )
 ```
 
 To get an overview of all intermediate sizes of arrays and the number of parameters in each layer, the `summary()` method can be a great help.
@@ -365,19 +365,19 @@ Output:
     Layer (type)                                Output Shape         Param #
 ================================================================================
                Input                              (1, 1, 28, 28)               0
-        Activation-1                                                           0
+        Activation-1                                                           0
         Activation-2                             (1, 20, 24, 24)               0
             Conv2D-3                             (1, 20, 24, 24)             520
          MaxPool2D-4                             (1, 20, 12, 12)               0
-        Activation-5                                                           0
+        Activation-5                                                           0
         Activation-6                               (1, 50, 8, 8)               0
             Conv2D-7                               (1, 50, 8, 8)           25050
          MaxPool2D-8                               (1, 50, 4, 4)               0
            Flatten-9                                    (1, 800)               0
-       Activation-10                                                           0
+       Activation-10                                                           0
        Activation-11                                    (1, 500)               0
            Dense-12                                     (1, 500)          400500
-       Activation-13                                                           0
+       Activation-13                                                           0
        Activation-14                                     (1, 10)               0
            Dense-15                                      (1, 10)            5010
 ================================================================================
@@ -401,7 +401,7 @@ Note that it is advisable to use a GPU if possible, since this model is signific
 trainer = gluon.Trainer(
     params=lenet.collect_params(),
     optimizer='sgd',
-    optimizer_params={'learning_rate': 0.02},
+    optimizer_params={'learning_rate': 0.04},
 )
 metric = mx.metric.Accuracy()
 num_epochs = 10
@@ -429,7 +429,7 @@ for inputs, labels in val_loader:
     labels = labels.as_in_context(ctx)
     metric.update(labels, lenet(inputs))
 print('Validation: {} = {}'.format(*metric.get()))
-assert metric.get()[1] > 0.975
+assert metric.get()[1] > 0.965
 ```
 
 If all went well, we should see a higher accuracy metric for predictions made using LeNet.
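The final patch below removes the remaining output-layer activations. As background, here is a minimal standalone sketch (class count and scores are invented for illustration) of why this is safe: `SoftmaxCrossEntropyLoss` applies the softmax internally, so the network can emit raw scores directly.

```python
from mxnet import gluon, nd

# Raw network outputs (logits) for a batch of 2 samples and 3 classes
logits = nd.array([[2.0, 0.5, -1.0], [0.1, 0.3, 0.2]])
labels = nd.array([0, 2])

# SoftmaxCrossEntropyLoss applies log-softmax internally ...
loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
print(loss_fn(logits, labels))

# ... which is equivalent to taking the log-softmax explicitly and
# picking the entry of the true class
log_probs = nd.log_softmax(logits)
print(-nd.pick(log_probs, labels))
```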
From 242e5e4d5d862d4048d37f0fc053f763ad960382 Mon Sep 17 00:00:00 2001
From: Holger Kohr
Date: Wed, 5 Dec 2018 21:13:40 +0100
Subject: [PATCH 10/10] Remove superfluous last layer activations in MNIST
 tutorial

---
 docs/tutorials/gluon/mnist.md | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/docs/tutorials/gluon/mnist.md b/docs/tutorials/gluon/mnist.md
index 7634c15ca676..35fb40521f62 100644
--- a/docs/tutorials/gluon/mnist.md
+++ b/docs/tutorials/gluon/mnist.md
@@ -70,7 +70,7 @@ val_loader = mx.gluon.data.DataLoader(val_data, shuffle=False, batch_size=batch_
 
 We will cover two approaches for performing the hand-written digit recognition task.
 In our first attempt, we will make use of a traditional neural network architecture called [Multilayer Perceptron (MLP)](https://en.wikipedia.org/wiki/Multilayer_perceptron).
-Although this architecture lets us achieve over 90 % accuracy on the validation set, we will recognize and discuss some of its drawbacks and use them as a motivation for using a different network.
+Although this architecture lets us achieve over 95 % accuracy on the validation set, we will recognize and discuss some of its drawbacks and use them as a motivation for using a different network.
 In the subsequent second attempt, we introduce the more advanced and very widely used [Convolutional Neural Network (CNN)](https://en.wikipedia.org/wiki/Convolutional_neural_network) architecture that has proven to work very well for image classification tasks.
 
 As a first step, we run some convenience imports of frequently used modules.
@@ -111,7 +111,7 @@ with net.name_scope():
         nn.Flatten(),
         nn.Dense(128, activation='relu'),
         nn.Dense(64, activation='relu'),
-        nn.Dense(10, activation='sigmoid')
+        nn.Dense(10, activation=None)  # loss function includes softmax already, see below
     )
 ```
@@ -163,6 +163,8 @@ As a better behaved proxy for inaccuracy, the [softmax cross-entropy loss](https
 It has the essential property of being minimal for the correct prediction, but at the same time, it is everywhere differentiable with nonzero gradient.
 The [accuracy](https://mxnet.incubator.apache.org/api/python/metric/metric.html#mxnet.metric.Accuracy) metric is still useful for monitoring the training progress, since it is more intuitively interpretable than a loss value.
 
+**Note:** `SoftmaxCrossEntropyLoss` combines the softmax activation and the cross-entropy loss function in one layer; therefore, the last layer in our network has no activation function.
+
 ```python
 metric = mx.metric.Accuracy()
 loss_function = gluon.loss.SoftmaxCrossEntropyLoss()
@@ -232,10 +234,10 @@ for inputs, labels in val_loader:
     labels = labels.as_in_context(ctx)
     metric.update(labels, net(inputs))
 print('Validation: {} = {}'.format(*metric.get()))
-assert metric.get()[1] > 0.92
+assert metric.get()[1] > 0.96
 ```
 
-If everything went well, we should see an accuracy value that is around 0.925, which means that we are able to accurately predict the digit in 92.5 % of test images.
+If everything went well, we should see an accuracy value that is around 0.968, which means that we are able to accurately predict the digit in 97 % of test images.
 This is a pretty good result, but as we will see in the next part of this tutorial, we can do a lot better than that.
 
 That said, a single number only conveys very limited information on the performance of our neural network.
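One cheap way to go beyond that single number, sketched below under the assumption that `net`, `val_loader` and `ctx` are defined as in the tutorial, is to break the accuracy down by true digit class:

```python
from mxnet import nd

# Tally correct predictions and sample counts per true digit class
correct = [0] * 10
total = [0] * 10
for inputs, labels in val_loader:
    preds = nd.argmax(net(inputs.as_in_context(ctx)), axis=1)
    for p, l in zip(preds.asnumpy().astype(int), labels.asnumpy().astype(int)):
        total[l] += 1
        correct[l] += int(p == l)

for digit in range(10):
    print('digit {}: accuracy = {:.3f}'.format(digit, correct[digit] / total[digit]))
```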
@@ -346,7 +348,7 @@ with lenet.name_scope():
         nn.MaxPool2D(pool_size=(2, 2), strides=(2, 2)),
         nn.Flatten(),
         nn.Dense(500, activation='tanh'),
-        nn.Dense(10, activation='sigmoid'),
+        nn.Dense(10, activation=None),
     )
 ```
@@ -377,9 +379,7 @@ Output:
        Activation-10                                                           0
        Activation-11                                    (1, 500)               0
            Dense-12                                     (1, 500)          400500
-       Activation-13                                                           0
-       Activation-14                                     (1, 10)               0
-           Dense-15                                      (1, 10)            5010
+           Dense-13                                      (1, 10)            5010
 ================================================================================
 Parameters in forward computation graph, duplicate included
    Total params: 431080
@@ -429,11 +429,11 @@ for inputs, labels in val_loader:
     labels = labels.as_in_context(ctx)
     metric.update(labels, lenet(inputs))
 print('Validation: {} = {}'.format(*metric.get()))
-assert metric.get()[1] > 0.965
+assert metric.get()[1] > 0.985
 ```
 
 If all went well, we should see a higher accuracy metric for predictions made using LeNet.
-With this CNN we should be able to correctly predict around 98% of all test images.
+With this CNN we should be able to correctly predict around 99 % of all validation images.
 
 ## Summary