From 7ab0dd6da4a9ebbe637f63933efe609c48eb024e Mon Sep 17 00:00:00 2001
From: NRauschmayr
Date: Tue, 6 Nov 2018 13:12:18 -0800
Subject: [PATCH 01/20] Adding info_gan example

---
 docs/tutorials/gluon/info_gan.md | 435 +++++++++++++++++++++++++++
 1 file changed, 435 insertions(+)
 create mode 100644 docs/tutorials/gluon/info_gan.md

diff --git a/docs/tutorials/gluon/info_gan.md b/docs/tutorials/gluon/info_gan.md
new file mode 100644
index 000000000000..993f9fe0f2a1
--- /dev/null
+++ b/docs/tutorials/gluon/info_gan.md
@@ -0,0 +1,435 @@
+
+# Image similarity search with InfoGAN
+
+This notebook shows how to implement an InfoGAN based on Gluon. InfoGAN is an extension of GANs, where the generator input is split into two parts: random noise and a latent code `c` (see the [InfoGAN Paper](https://arxiv.org/pdf/1606.03657.pdf)).
+The code is made meaningful by maximizing the mutual information between the code and the generator output. InfoGAN learns a disentangled representation in a completely unsupervised manner. It can be used for many applications, such as image similarity search. This notebook takes the DCGAN example from the [Straight Dope Book](https://gluon.mxnet.io/chapter14_generative-adversarial-networks/dcgan.html) and extends it to create an InfoGAN.
+
+
+```python
+from __future__ import print_function
+import sys
+import os
+import matplotlib as mpl
+import tarfile
+import matplotlib.image as mpimg
+from matplotlib import pyplot as plt
+from mxboard import SummaryWriter
+
+import mxnet as mx
+from mxnet import gluon
+from mxnet import ndarray as nd
+from mxnet.gluon import nn, utils
+from mxnet import autograd
+import numpy as np
+
+from datetime import datetime
+import time
+import logging
+```
+
+The latent code vector `c` can contain several variables, which can be categorical and/or continuous. We set `n_continuous` to 2 and `n_categories` to 10.
+
+
+```python
+batch_size = 64
+z_dim = 100
+n_continuous = 2
+n_categories = 10
+ctx = mx.cpu()
+```
+
+Some functions to load and normalize images.
+
+
+```python
+lfw_url = 'http://vis-www.cs.umass.edu/lfw/lfw-deepfunneled.tgz'
+data_path = 'lfw_dataset'
+if not os.path.exists(data_path):
+    os.makedirs(data_path)
+    data_file = utils.download(lfw_url)
+    with tarfile.open(data_file) as tar:
+        tar.extractall(path=data_path)
+
+
+```
+
+
+```python
+def transform(data, width=64, height=64):
+    data = mx.image.imresize(data, width, height)
+    data = nd.transpose(data, (2,0,1))
+    data = data.astype(np.float32)/127.5 - 1
+    if data.shape[0] == 1:
+        data = nd.tile(data, (3, 1, 1))
+    return data.reshape((1,) + data.shape)
+```
+
+
+```python
+def get_files(data_dir):
+    images = []
+    filenames = []
+    for path, _, fnames in os.walk(data_dir):
+        for fname in fnames:
+            if not fname.endswith('.jpg'):
+                continue
+            img = os.path.join(path, fname)
+            img_arr = mx.image.imread(img)
+            img_arr = transform(img_arr)
+            images.append(img_arr)
+            filenames.append(path + "/" + fname)
+    return images, filenames
+```
+
+Load the dataset `lfw_dataset`, which contains images of celebrities, and split it into a training and a test set.
+
+
+```python
+data_dir = 'lfw_dataset'
+images, filenames = get_files(data_dir)
+split = int(len(images)*0.8)
+test_images = images[split:]
+test_filenames = filenames[split:]
+train_images = images[:split]
+train_filenames = filenames[:split]
+
+train_data = mx.gluon.data.ArrayDataset(nd.concatenate(train_images))
+train_dataloader = mx.gluon.data.DataLoader(train_data, batch_size=batch_size, shuffle=True, last_batch='rollover', num_workers=4)
+```
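+
+Before building the networks, it can help to sanity check the data pipeline. The following is a minimal, optional check (assuming the download above succeeded): it pulls a single batch and prints its shape and value range.
+
+
+```python
+# Optional sanity check: one batch should have shape (batch_size, 3, 64, 64)
+# and values roughly in [-1, 1] after the normalization in `transform`.
+for batch in train_dataloader:
+    print(batch.shape)
+    print(batch.min().asscalar(), batch.max().asscalar())
+    break
+```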
+
+## Generator
+Define the Generator model. The architecture is taken from the DCGAN implementation in the [Straight Dope Book](https://gluon.mxnet.io/chapter14_generative-adversarial-networks/dcgan.html). The Generator consists of four layers, each of which combines a strided convolution, batch normalization, and a rectified nonlinearity. It takes random noise and the latent code `c` as input and produces a `(64,64,3)` output image.
+
+
+```python
+class Generator(gluon.HybridBlock):
+    def __init__(self, **kwargs):
+        super(Generator, self).__init__(**kwargs)
+        with self.name_scope():
+            self.prev = nn.HybridSequential()
+            self.prev.add(nn.Dense(1024, use_bias=False), nn.BatchNorm(), nn.Activation(activation='relu'))
+            self.G = nn.HybridSequential()
+
+            self.G.add(nn.Conv2DTranspose(64 * 8, 4, 1, 0, use_bias=False))
+            self.G.add(nn.BatchNorm())
+            self.G.add(nn.Activation('relu'))
+            self.G.add(nn.Conv2DTranspose(64 * 4, 4, 2, 1, use_bias=False))
+            self.G.add(nn.BatchNorm())
+            self.G.add(nn.Activation('relu'))
+            self.G.add(nn.Conv2DTranspose(64 * 2, 4, 2, 1, use_bias=False))
+            self.G.add(nn.BatchNorm())
+            self.G.add(nn.Activation('relu'))
+            self.G.add(nn.Conv2DTranspose(64, 4, 2, 1, use_bias=False))
+            self.G.add(nn.BatchNorm())
+            self.G.add(nn.Activation('relu'))
+            self.G.add(nn.Conv2DTranspose(3, 4, 2, 1, use_bias=False))
+            self.G.add(nn.Activation('tanh'))
+
+    def hybrid_forward(self, F, x):
+        x = self.prev(x)
+        x = F.reshape(x, (0, -1, 1, 1))
+        return self.G(x)
+```
+
+## Discriminator
+Define the Discriminator and Q model. The Q model shares many layers with the Discriminator. Its task is to estimate the code $c$ for a given fake image, and it is used to maximize the lower bound on the mutual information.
+
+
+```python
+class Discriminator(gluon.HybridBlock):
+    def __init__(self, **kwargs):
+        super(Discriminator, self).__init__(**kwargs)
+        with self.name_scope():
+            self.D = nn.HybridSequential()
+            self.D.add(nn.Conv2D(64, 4, 2, 1, use_bias=False))
+            self.D.add(nn.LeakyReLU(0.2))
+            self.D.add(nn.Conv2D(64 * 2, 4, 2, 1, use_bias=False))
+            self.D.add(nn.BatchNorm())
+            self.D.add(nn.LeakyReLU(0.2))
+            self.D.add(nn.Conv2D(64 * 4, 4, 2, 1, use_bias=False))
+            self.D.add(nn.BatchNorm())
+            self.D.add(nn.LeakyReLU(0.2))
+            self.D.add(nn.Conv2D(64 * 8, 4, 2, 1, use_bias=False))
+            self.D.add(nn.BatchNorm())
+            self.D.add(nn.LeakyReLU(0.2))
+
+            self.D.add(nn.Dense(1024, use_bias=False), nn.BatchNorm(), nn.Activation(activation='relu'))
+
+            self.prob = nn.Dense(1)#, activation='sigmoid')
+            self.feat = nn.HybridSequential()
+            self.feat.add(nn.Dense(128, use_bias=False), nn.BatchNorm(), nn.Activation(activation='relu'))
+            self.category_prob = nn.Dense(n_categories)
+            self.continuous_mean = nn.Dense(n_continuous)
+            self.Q = nn.HybridSequential()
+            self.Q.add(self.feat, self.category_prob, self.continuous_mean)
+
+    def hybrid_forward(self, F, x):
+        x = self.D(x)
+        prob = self.prob(x)
+        feat = self.feat(x)
+        category_prob = self.category_prob(feat)
+        continuous_mean = self.continuous_mean(feat)
+
+        return prob, category_prob, continuous_mean
+```
+
+The InfoGAN has the following layout.
+
+Discriminator and Generator are the same as in the DCGAN example. On top of the Discriminator sits the Q model, which estimates the code `c` for a given fake image. The Generator's input is random noise and the latent code `c`.
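+
+Before moving on, a quick shape check can confirm the wiring. This is an optional sketch that instantiates throwaway copies of both networks and pushes a dummy batch through them; the generator input width is `z_dim + n_categories + n_continuous = 112`.
+
+
+```python
+# Optional shape check with throwaway, freshly initialized networks.
+g_check = Generator()
+g_check.initialize(mx.init.Normal(0.002), ctx=ctx)
+d_check = Discriminator()
+d_check.initialize(mx.init.Normal(0.002), ctx=ctx)
+
+dummy = nd.random.uniform(shape=(batch_size, z_dim + n_categories + n_continuous), ctx=ctx)
+fake = g_check(dummy)
+print(fake.shape)  # (64, 3, 64, 64)
+prob, category_prob, continuous_mean = d_check(fake)
+print(prob.shape, category_prob.shape, continuous_mean.shape)  # (64, 1) (64, 10) (64, 2)
+```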
+
+## Training Loop
+Initialize the Generator and Discriminator and define the corresponding trainers.
+
+
+```python
+generator = Generator()
+generator.hybridize()
+generator.initialize(mx.init.Normal(0.002), ctx=ctx)
+
+discriminator = Discriminator()
+discriminator.hybridize()
+discriminator.initialize(mx.init.Normal(0.002), ctx=ctx)
+
+lr = 0.0001
+beta = 0.5
+
+g_trainer = gluon.Trainer(generator.collect_params(), 'adam', {'learning_rate': lr, 'beta1': beta})
+d_trainer = gluon.Trainer(discriminator.collect_params(), 'adam', {'learning_rate': lr, 'beta1': beta})
+q_trainer = gluon.Trainer(discriminator.Q.collect_params(), 'adam', {'learning_rate': lr, 'beta1': beta})
+```
+
+Create vectors with real (=1) and fake (=0) labels.
+
+
+```python
+real_label = nd.ones((batch_size,), ctx=ctx)
+fake_label = nd.zeros((batch_size,),ctx=ctx)
+```
+
+Load a pretrained model, if one exists.
+
+
+```python
+if os.path.isfile("infogan_d_latest.params") and os.path.isfile("infogan_g_latest.params"):
+    discriminator.load_parameters('infogan_d_latest.params', ctx=ctx, allow_missing=True, ignore_extra=True)
+    generator.load_parameters('infogan_g_latest.params', ctx=ctx, allow_missing=True, ignore_extra=True)
+```
+
+The latent code $c$ is part of the Generator input, and it contains multiple variables (continuous and categorical) that can represent different distributions. To make sure the Generator actually uses the latent code, mutual information is introduced into the GAN loss term. Mutual information measures how much is known about X given Y, and vice versa. It is defined as:
+
+$$I(X;Y) = entropy(X) - entropy(X|Y) = entropy(Y) - entropy(Y|X) $$
+
+The InfoGAN loss is:
+$$\min_{G} \max_{D} \, V(D, G) - \lambda I(c, G(z, c))$$
+
+where $V(D,G)$ is the GAN loss and the mutual information $I(c, G(z, c))$ enters as a regularizer. The goal is to reach high mutual information, in order to learn meaningful codes $c$ for the data.
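+
+The mutual information term itself is intractable to compute directly. The InfoGAN paper therefore maximizes a variational lower bound built from an auxiliary distribution $Q(c|x)$, which is exactly what the Q network models:
+
+$$I(c; G(z, c)) \geq E_{c \sim P(c),\, x \sim G(z,c)}[\log Q(c|x)] + entropy(c)$$
+
+Because the entropy of the code prior is a constant, maximizing this bound amounts to minimizing $-\log Q(c|x)$, which is what the categorical cross-entropy and L2 terms defined below implement (the L2 loss corresponds to a Gaussian $Q$ with fixed variance, up to constants).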
+
+Define the loss functions: `SoftmaxCrossEntropyLoss` for the categorical code `c`, `L2Loss` for the continuous code `c`, and `SigmoidBinaryCrossEntropyLoss` for the standard GAN loss.
+
+
+```python
+loss1 = gluon.loss.SigmoidBinaryCrossEntropyLoss()
+loss2 = gluon.loss.L2Loss()
+loss3 = gluon.loss.SoftmaxCrossEntropyLoss()
+```
+
+This function samples `c` and `z` and concatenates them to create the generator input. It also returns the sampled `label` and `c2`, which the training loop needs as targets for the Q losses.
+
+
+```python
+def create_generator_input():
+
+    #create random noise
+    z = mx.nd.random_normal(0, 1, shape=(batch_size, z_dim), ctx=ctx)
+    label = nd.array(np.random.randint(n_categories, size=batch_size)).as_in_context(ctx)
+    c1 = nd.one_hot(label, depth=n_categories).as_in_context(ctx)
+    c2 = nd.random.uniform(-1, 1, shape=(batch_size, n_continuous)).as_in_context(ctx)
+
+    # concatenate random noise with c, which will be the input of the generator;
+    # also return label and c2, the targets for loss3 and loss2
+    return mx.nd.concat(z, c1, c2, dim=1), label, c2
+```
+
+Define the training loop.
+1. The discriminator receives `real_data` and `loss1` measures how many real images have been identified as real
+2. The discriminator receives `fake_image` from the Generator and `loss1` measures how many fake images have been identified as fake
+3. Update the Discriminator
+4. The updated discriminator receives `fake_image`; `loss1` measures how many fake images have been identified as real, `loss2` measures the difference between the sampled continuous latent code `c` and the output of the Q model, and `loss3` measures the difference between the sampled categorical latent code `c` and the output of the Q model
+5. Update the Generator and Q
+
+
+```python
+with SummaryWriter(logdir='./logs/') as sw:
+
+    epochs = 1
+    i = 0
+    for epoch in range(epochs):
+        print("Epoch", epoch)
+        starttime = time.time()
+
+        d_error_epoch = mx.nd.zeros((1,), ctx=ctx)
+        g_error_epoch = mx.nd.zeros((1,), ctx=ctx)
+
+        for idx, data in enumerate(train_dataloader):
+            i = i + 1
+
+            #get real data, the generator input and the sampled codes (targets for the Q losses)
+            real_data = data.as_in_context(ctx)
+            g_input, label, c2 = create_generator_input()
+
+
+            #Update discriminator: Input real data and fake data
+            with autograd.record():
+                output_real,_,_ = discriminator(real_data)
+                d_error_real = loss1(output_real, real_label)
+
+                # create fake image and input it to discriminator
+                fake_image = generator(g_input)
+                output_fake,_,_ = discriminator(fake_image.detach())
+                d_error_fake = loss1(output_fake, fake_label)
+
+                # total discriminator error
+                d_error = d_error_real + d_error_fake
+
+            d_error_epoch += d_error.mean()
+            if i % 2 == 0:
+                d_error.backward()
+                d_trainer.step(data.shape[0])
+
+            #Update generator: Input random noise and latent code vector
+            with autograd.record():
+                fake_image = generator(g_input)
+                output_fake, category_prob, continuous_mean = discriminator(fake_image)
+                g_error = loss1(output_fake, real_label) + loss3(category_prob, label) + loss2(c2, continuous_mean)
+
+            g_error.backward()
+            g_error_epoch += g_error.mean()
+
+            g_trainer.step(data.shape[0])
+            q_trainer.step(data.shape[0])
+
+            # logging
+            if idx % 10 == 0:
+
+                logging.info('speed: {} samples/s'.format(batch_size / (time.time() - starttime)))
+                logging.info('discriminator loss = %f, generator loss = %f at iter %d epoch %d'
+                        %(d_error_epoch.asscalar()/(idx+1),g_error_epoch.asscalar()/(idx+1), idx, epoch))
+
+                g_input, _, _ = create_generator_input()
+
+                # create some fake image for logging in MXBoard
+                fake_image = generator(g_input)
+
+                sw.add_scalar(tag='Loss_D', value={'test':d_error_epoch.asscalar()/(idx+1)}, global_step=i)
+                sw.add_scalar(tag='Loss_G', value={'test':g_error_epoch.asscalar()/(idx+1)}, global_step=i)
+                sw.add_image(tag='data_image', image=((fake_image[0]+ 1.0) * 127.5).astype(np.uint8) , global_step=i)
+                sw.flush()
+
+        time1 = time.time()
+
+        #discriminator.save_parameters("infogan_d.params")
+        #generator.save_parameters("infogan_g.params")
+```
+
+## Image similarity
+Once the InfoGAN is trained, we can use the Discriminator for an image similarity search. The idea is that, thanks to the mutual information term, the network has learned meaningful features from the images, e.g. the pose of people in an image.
+
+Load the trained discriminator and retrieve one of its last layers.
+
+
+```python
+discriminator = Discriminator()
+discriminator.load_parameters("infogan_d_latest.params", ctx=ctx, ignore_extra=True)
+
+discriminator = discriminator.D[:11]
+print (discriminator)
+
+discriminator.hybridize()
+```
+
+A nearest-neighbor function that takes a matrix of features and an input feature vector, and returns the `k` closest features.
+
+
+```python
+def get_knn(features, input_vector, k=3):
+    dist = (nd.square(features - input_vector).sum(axis=1))/features.shape[0]
+    print (np.sort(dist.asnumpy())[:10])
+    indices = dist.asnumpy().argsort()[:k]
+    return [(index, dist[index].asscalar()) for index in indices]
+```
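+
+As a quick illustration of the return format, here is a toy call on made-up 2-D vectors (hypothetical data, not part of the tutorial):
+
+
+```python
+# Toy example: three 2-D "feature" rows and a query vector; nearest index first.
+toy_features = nd.array([[0.0, 0.0], [1.0, 1.0], [10.0, 10.0]])
+query = nd.array([0.9, 0.9])
+print(get_knn(toy_features, query, k=2))  # e.g. [(1, ~0.007), (0, 0.54)]
+```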
+
+A helper function to visualize image data.
+
+
+```python
+def visualize(img_array):
+    plt.imshow(((img_array.asnumpy().transpose(1, 2, 0) + 1.0) * 127.5).astype(np.uint8))
+    plt.axis('off')
+```
+
+Take some images from the test data, obtain their feature vectors from `discriminator.D[:11]`, and plot the images that correspond to the closest vectors in feature space.
+
+
+```python
+feature_size = 8192
+
+features = mx.nd.zeros((len(test_images), feature_size), ctx=ctx)
+
+for idx, image in enumerate(test_images):
+
+    feature = discriminator(mx.nd.array(image))
+    feature = feature.reshape(feature_size,)
+    features[idx,:] = feature.copyto(ctx)
+
+
+for image in test_images[:100]:
+
+    feature = discriminator(mx.nd.array(image))
+    feature = feature.reshape((feature_size,))
+    image = image.reshape((3,64,64))
+
+
+    indices = get_knn(features, feature, k=10)
+    fig = plt.figure(figsize=(15,12))
+    plt.subplot(1,10,1)
+
+    visualize(image)
+    for i in range(2,9):
+        if indices[i-1][1] < 1.5:
+            plt.subplot(1,10,i)
+            sim = test_images[indices[i-1][0]].reshape(3,64,64)
+            visualize(sim)
+    plt.show()
+    plt.clf()
+```
+
+## How the Generator learns
+We trained the Generator for a couple of epochs and stored a couple of fake images per epoch. Check the video.
+ ![alt text](https://raw.githubusercontent.com/NRauschmayr/InfoGAN_Gluon/master/images/infogan.gif)
+
+
+The following function computes the TSNE on the feature matrix and stores the result in a json-file. This file can be loaded with [TSNEViewer](https://ml4a.github.io/guides/ImageTSNEViewer/)
+
+
+```python
+from sklearn.manifold import TSNE
+from scipy.spatial import distance
+import os
+import json
+
+tsne = TSNE(n_components=2, learning_rate=150, perplexity=30, verbose=2).fit_transform(features.asnumpy())
+# save data to json
+data = []
+counter = 0
+for i,f in enumerate(test_filenames):
+
+    point = [float((tsne[i,k] - np.min(tsne[:,k]))/(np.max(tsne[:,k]) - np.min(tsne[:,k]))) for k in range(2) ]
+    data.append({"path": os.path.abspath(os.path.join(os.getcwd(),f)), "point": point})
+
+with open("imagetsne.json", 'w') as outfile:
+    json.dump(data, outfile)
+```
+
+Load the file with TSNEViewer. You can now inspect whether similar looking images are grouped nearby or not.
+
+

From f9e012004ebde5a88b7bc1ccd606696da8f1b9ba Mon Sep 17 00:00:00 2001
From: NRauschmayr
Date: Tue, 6 Nov 2018 13:22:40 -0800
Subject: [PATCH 02/20] adjust paths of filenames

---
 docs/tutorials/gluon/info_gan.md  | 6 +++---
 tests/tutorials/test_tutorials.py | 5 ++++-
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/docs/tutorials/gluon/info_gan.md b/docs/tutorials/gluon/info_gan.md
index 993f9fe0f2a1..7419e1bf5d63 100644
--- a/docs/tutorials/gluon/info_gan.md
+++ b/docs/tutorials/gluon/info_gan.md
@@ -174,7 +174,7 @@ class Discriminator(gluon.HybridBlock):
 ```
 The InfoGAN has the following layout.
-
+
 Discriminator and Generator are the same as in the DCGAN example. On top of the Disciminator is the Q model, which is estimating the code `c` for given fake images. The Generator's input is random noise and the latent code `c`.
 
 ## Training Loop
@@ -405,7 +405,7 @@ for image in test_images[:100]:
 
 ## How the Generator learns
 We trained the Generator for a couple of epochs and stored a couple of fake images per epoch. Check the video.
- ![alt text](https://raw.githubusercontent.com/NRauschmayr/InfoGAN_Gluon/master/images/infogan.gif) + ![alt text](https://raw.githubusercontent.com/NRauschmayr/web-data/master/mxnet/doc/tutorials/info_gan/infogan.gif) The following function computes the TSNE on the feature matrix and stores the result in a json-file. This file can be loaded with [TSNEViewer](https://ml4a.github.io/guides/ImageTSNEViewer/) @@ -432,4 +432,4 @@ with open("imagetsne.json", 'w') as outfile: Load the file with TSNEViewer. You can now inspect whether similiar looking images are grouped nearby or not. - + diff --git a/tests/tutorials/test_tutorials.py b/tests/tutorials/test_tutorials.py index 5b8e2152bc75..c9ad00c189ad 100644 --- a/tests/tutorials/test_tutorials.py +++ b/tests/tutorials/test_tutorials.py @@ -117,7 +117,10 @@ def test_gluon_learning_rate_schedules(): def test_gluon_learning_rate_schedules_advanced(): assert _test_tutorial_nb('gluon/learning_rate_schedules_advanced') - + +def test_gluon_info_gan(): + assert _test_tutorial_nb('gluon/info_gan') + def test_nlp_cnn(): assert _test_tutorial_nb('nlp/cnn') From 73550d1269a331b2d48191af9a85958b51f8d6ca Mon Sep 17 00:00:00 2001 From: NRauschmayr Date: Tue, 6 Nov 2018 13:40:13 -0800 Subject: [PATCH 03/20] Update index.md --- docs/tutorials/index.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md index 07f32b501d92..49239e12086d 100644 --- a/docs/tutorials/index.md +++ b/docs/tutorials/index.md @@ -57,6 +57,7 @@ Select API:  * [Logistic Regression](/tutorials/gluon/logistic_regression_explained.html) * [Word-level text generation with RNN, LSTM and GRU](http://gluon.mxnet.io/chapter05_recurrent-neural-networks/rnns-gluon.html) External link * [Visual Question Answering](http://gluon.mxnet.io/chapter08_computer-vision/visual-question-answer.html) External link + * [Image similiarity search with InfoGAN](/tutorials/gluon/info_gan.html)External link * Practitioner Guides * [Gotchas using NumPy](/tutorials/gluon/gotchas_numpy_in_mxnet.html) * [Multi-GPU training](http://gluon.mxnet.io/chapter07_distributed-learning/multiple-gpus-gluon.html) External link From 307b76c2e1597332d122aea8d8afd764c2faf67d Mon Sep 17 00:00:00 2001 From: NRauschmayr Date: Tue, 6 Nov 2018 13:42:54 -0800 Subject: [PATCH 04/20] Update index.md --- docs/tutorials/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md index 49239e12086d..458d2fbe30b9 100644 --- a/docs/tutorials/index.md +++ b/docs/tutorials/index.md @@ -57,7 +57,7 @@ Select API:  * [Logistic Regression](/tutorials/gluon/logistic_regression_explained.html) * [Word-level text generation with RNN, LSTM and GRU](http://gluon.mxnet.io/chapter05_recurrent-neural-networks/rnns-gluon.html) External link * [Visual Question Answering](http://gluon.mxnet.io/chapter08_computer-vision/visual-question-answer.html) External link - * [Image similiarity search with InfoGAN](/tutorials/gluon/info_gan.html)External link + * [Image similiarity search with InfoGAN](/tutorials/gluon/info_gan.md)External link * Practitioner Guides * [Gotchas using NumPy](/tutorials/gluon/gotchas_numpy_in_mxnet.html) * [Multi-GPU training](http://gluon.mxnet.io/chapter07_distributed-learning/multiple-gpus-gluon.html) External link From ac1843020138cc4d574caaee89dd0c0752e99359 Mon Sep 17 00:00:00 2001 From: NRauschmayr Date: Tue, 6 Nov 2018 13:44:36 -0800 Subject: [PATCH 05/20] Update index.md --- docs/tutorials/index.md | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md index 458d2fbe30b9..e4e74e383603 100644 --- a/docs/tutorials/index.md +++ b/docs/tutorials/index.md @@ -57,7 +57,7 @@ Select API:  * [Logistic Regression](/tutorials/gluon/logistic_regression_explained.html) * [Word-level text generation with RNN, LSTM and GRU](http://gluon.mxnet.io/chapter05_recurrent-neural-networks/rnns-gluon.html) External link * [Visual Question Answering](http://gluon.mxnet.io/chapter08_computer-vision/visual-question-answer.html) External link - * [Image similiarity search with InfoGAN](/tutorials/gluon/info_gan.md)External link + * [Image similiarity search with InfoGAN](/docs/tutorials/gluon/info_gan.md)External link * Practitioner Guides * [Gotchas using NumPy](/tutorials/gluon/gotchas_numpy_in_mxnet.html) * [Multi-GPU training](http://gluon.mxnet.io/chapter07_distributed-learning/multiple-gpus-gluon.html) External link From e476ad2d24a356c52cd097584724f3200af55a33 Mon Sep 17 00:00:00 2001 From: Nathalie Rauschmayr Date: Tue, 6 Nov 2018 14:13:31 -0800 Subject: [PATCH 06/20] Update info_gan.md Added an image --- docs/tutorials/gluon/info_gan.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/tutorials/gluon/info_gan.md b/docs/tutorials/gluon/info_gan.md index 7419e1bf5d63..5625d44f9135 100644 --- a/docs/tutorials/gluon/info_gan.md +++ b/docs/tutorials/gluon/info_gan.md @@ -402,6 +402,7 @@ for image in test_images[:100]: plt.show() plt.clf() ``` +![png](https://raw.githubusercontent.com/NRauschmayr/web-data/master/mxnet/doc/tutorials/info_gan/output.png) ## How the Generator learns We trained the Generator for a couple of epochs and stored a couple of fake images per epoch. Check the video. From 5de6b15b56af9d1e9340931b7c0ca6f57348e2d7 Mon Sep 17 00:00:00 2001 From: Nathalie Rauschmayr Date: Tue, 6 Nov 2018 15:05:24 -0800 Subject: [PATCH 07/20] Update info_gan.md Applied some fixes --- docs/tutorials/gluon/info_gan.md | 37 ++++++++++++++++---------------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/docs/tutorials/gluon/info_gan.md b/docs/tutorials/gluon/info_gan.md index 5625d44f9135..daef402b76b9 100644 --- a/docs/tutorials/gluon/info_gan.md +++ b/docs/tutorials/gluon/info_gan.md @@ -7,24 +7,22 @@ The codes are made meaningful by maximizing the mutual information between code ```python from __future__ import print_function +from datetime import datetime import sys import os -import matplotlib as mpl +import logging +import time import tarfile -import matplotlib.image as mpimg -from matplotlib import pyplot as plt -from mxboard import SummaryWriter +from matplotlib import pyplot as plt import mxnet as mx from mxnet import gluon from mxnet import ndarray as nd from mxnet.gluon import nn, utils from mxnet import autograd +from mxboard import SummaryWriter import numpy as np -from datetime import datetime -import time -import logging ``` The latent code vector c can contain several variables, which can be categorical and/or continuous. We set `n_continuous` to 2 and `n_categories` to 10. @@ -50,7 +48,6 @@ if not os.path.exists(data_path): with tarfile.open(data_file) as tar: tar.extractall(path=data_path) - ``` @@ -132,7 +129,7 @@ class Generator(gluon.HybridBlock): ``` ## Discriminator -Define the Discriminator and Q model. The Q model shares many layers with the Discriminator. Its task is to estimate the code $c$ for a given fake image. It is used to maximize the lower bound to the mutual information. +Define the Discriminator and Q model. 
The Q model shares many layers with the Discriminator. Its task is to estimate the code `c` for a given fake image. It is used to maximize the lower bound to the mutual information. ```python @@ -155,7 +152,7 @@ class Discriminator(gluon.HybridBlock): self.D.add(nn.Dense(1024, use_bias=False), nn.BatchNorm(), nn.Activation(activation='relu')) - self.prob = nn.Dense(1)#, activation='sigmoid') + self.prob = nn.Dense(1) self.feat = nn.HybridSequential() self.feat.add(nn.Dense(128, use_bias=False), nn.BatchNorm(), nn.Activation(activation='relu')) self.category_prob = nn.Dense(n_categories) @@ -217,12 +214,13 @@ if os.path.isfile("infogan_d_latest.params") and os.path.isfile("infogan_g_lates The latent code $c$ is part of the Generator input and it contains mutliple variables (continuous, categorical) that can represent different distributions. In order to make sure that the Generator uses the latent code, mutual information is introduced into the GAN loss term. Mutual information measures how much X is known given Y or vice versa. It is defined as: -$$I(X;Y) = entropy(X) - entropy(X|Y) = entropy(Y) - entropy(Y|X) $$ +![gif](https://raw.githubusercontent.com/NRauschmayr/web-data/master/mxnet/doc/tutorials/info_gan/entropy.gif) The InfoGAN loss is: -$$\min_{G} \max_{D} \, V(D, G) - \lambda I(c, G(z, c))$$ -where $V(D,G)$ is the GAN loss and the mutual information $I(c, G(z, c))$ goes in as regularization. The goal is to reach high mutual information, in order to learn meaningful codes $c$ for the data. +![gif](https://raw.githubusercontent.com/NRauschmayr/web-data/master/mxnet/doc/tutorials/info_gan/loss.gif) + +where `V(D,G)` is the GAN loss and the mutual information `I(c, G(z, c))` goes in as regularization. The goal is to reach high mutual information, in order to learn meaningful codes `c` for the data. Define the loss functions. `SoftmaxCrossEntropyLoss` for the categorical code `c`, `L2Loss` for the continious code `c` and `SigmoidBinaryCrossEntropyLoss` for the normal GAN loss. @@ -253,7 +251,7 @@ def create_generator_input(): Define the training loop. 1. The discriminator receives `real_data` and `loss1` measures how many real images have been identified as real 2. The discriminator receives `fake_image` from the Generator and `loss1` measures how many fake images have been identified as fake -3. Update Discriminator +3. Update Discriminator. Currently, it is updated every second iteration in order to avoid that the Discriminator becomes too strong. You may want to change that. 4. The updated discriminator receives `fake_image` and `loss1` measures how many fake images have been been identified as real, `loss2` measures the difference between the sampled continuous latent code `c` and the output of the Q model and `loss3` measures the difference between the sampled categorical latent code `c` and the output of the Q model. 4. 
Update Generator and Q @@ -292,6 +290,8 @@ with SummaryWriter(logdir='./logs/') as sw: d_error = d_error_real + d_error_fake d_error_epoch += d_error.mean() + + #Update D every second iteration if i % 2 == 0: d_error.backward() d_trainer.step(data.shape[0]) @@ -327,8 +327,8 @@ with SummaryWriter(logdir='./logs/') as sw: time1 = time.time() - #discriminator.save_parameters("infogan_d.params") - #generator.save_parameters("infogan_g.params") + discriminator.save_parameters("infogan_d_latest.params") + generator.save_parameters("infogan_g_latest.params") ``` ## Image similarity @@ -413,12 +413,13 @@ The following function computes the TSNE on the feature matrix and stores the re ```python +import json + from sklearn.manifold import TSNE from scipy.spatial import distance -import os -import json tsne = TSNE(n_components=2, learning_rate=150, perplexity=30, verbose=2).fit_transform(features.asnumpy()) + # save data to json data = [] counter = 0 From a59a811cb9262d7e6c1ae9ce5bb0d0da27a1ad70 Mon Sep 17 00:00:00 2001 From: Nathalie Rauschmayr Date: Tue, 6 Nov 2018 15:08:43 -0800 Subject: [PATCH 08/20] Update info_gan.md Applied some fixes --- docs/tutorials/gluon/info_gan.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/tutorials/gluon/info_gan.md b/docs/tutorials/gluon/info_gan.md index daef402b76b9..55e9660e23aa 100644 --- a/docs/tutorials/gluon/info_gan.md +++ b/docs/tutorials/gluon/info_gan.md @@ -353,7 +353,6 @@ Nearest neighbor function, which takes a matrix of features and an input feature ```python def get_knn(features, input_vector, k=3): dist = (nd.square(features - input_vector).sum(axis=1))/features.shape[0] - print (np.sort(dist.asnumpy())[:10]) indices = dist.asnumpy().argsort()[:k] return [(index, dist[index].asscalar()) for index in indices] ``` From 84037060c6357020fbbd9712131c633e93cfcb6e Mon Sep 17 00:00:00 2001 From: Nathalie Rauschmayr Date: Tue, 6 Nov 2018 15:13:25 -0800 Subject: [PATCH 09/20] Update info_gan.md Applied some fixes --- docs/tutorials/gluon/info_gan.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorials/gluon/info_gan.md b/docs/tutorials/gluon/info_gan.md index 55e9660e23aa..88ab572a60fd 100644 --- a/docs/tutorials/gluon/info_gan.md +++ b/docs/tutorials/gluon/info_gan.md @@ -33,7 +33,7 @@ batch_size = 64 z_dim = 100 n_continuous = 2 n_categories = 10 -ctx = mx.cpu() +ctx = mx.gpu() if mx.test_utils.list_gpus() else mx.cpu() ``` Some functions to load and normalize images. 
From 7d7470a7be9d221dfd556e60575fa0fc1e98b952 Mon Sep 17 00:00:00 2001 From: Thomas Delteil Date: Tue, 6 Nov 2018 15:17:51 -0800 Subject: [PATCH 10/20] Update info_gan.md --- docs/tutorials/gluon/info_gan.md | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/docs/tutorials/gluon/info_gan.md b/docs/tutorials/gluon/info_gan.md index 88ab572a60fd..31e13b43b85d 100644 --- a/docs/tutorials/gluon/info_gan.md +++ b/docs/tutorials/gluon/info_gan.md @@ -88,10 +88,10 @@ split = int(len(images)*0.8) test_images = images[split:] test_filenames = filenames[split:] train_images = images[:split] -train_filenames = filenames[:split] +train_filenames = filenames[:split] -train_data = mx.gluon.data.ArrayDataset(nd.concatenate(train_images)) -train_dataloader = mx.gluon.data.DataLoader(train_data, batch_size=batch_size, shuffle=True, last_batch='rollover', num_workers=4) +train_data = gluon.data.ArrayDataset(nd.concatenate(train_images)) +train_dataloader = gluon.data.DataLoader(train_data, batch_size=batch_size, shuffle=True, last_batch='rollover', num_workers=4) ``` ## Generator @@ -239,13 +239,13 @@ This function samples `c`, `z`, and concatenates them to create the generator in def create_generator_input(): #create random noise - z = mx.nd.random_normal(0, 1, shape=(batch_size, z_dim), ctx=ctx) + z = nd.random_normal(0, 1, shape=(batch_size, z_dim), ctx=ctx) label = nd.array(np.random.randint(n_categories, size=batch_size)).as_in_context(ctx) c1 = nd.one_hot(label, depth=n_categories).as_in_context(ctx) c2 = nd.random.uniform(-1, 1, shape=(batch_size, n_continuous)).as_in_context(ctx) # concatenate random noise with c which will be the input of the generator - return mx.nd.concat(z, c1, c2, dim=1) + return nd.concat(z, c1, c2, dim=1) ``` Define the training loop. @@ -265,8 +265,8 @@ with SummaryWriter(logdir='./logs/') as sw: print("Epoch", epoch) starttime = time.time() - d_error_epoch = mx.nd.zeros((1,), ctx=ctx) - g_error_epoch = mx.nd.zeros((1,), ctx=ctx) + d_error_epoch = nd.zeros((1,), ctx=ctx) + g_error_epoch = nd.zeros((1,), ctx=ctx) for idx, data in enumerate(train_dataloader): i = i + 1 @@ -372,11 +372,11 @@ Take some images from the test data, obtain its feature vector from `discriminat ```python feature_size = 8192 -features = mx.nd.zeros((len(test_images), feature_size), ctx=ctx) +features = nd.zeros((len(test_images), feature_size), ctx=ctx) for idx, image in enumerate(test_images): - feature = discriminator(mx.nd.array(image)) + feature = discriminator(nd.array(image)) feature = feature.reshape(feature_size,) features[idx,:] = feature.copyto(ctx) @@ -434,3 +434,5 @@ with open("imagetsne.json", 'w') as outfile: Load the file with TSNEViewer. You can now inspect whether similiar looking images are grouped nearby or not. + + From ea0faab0e5b12f59c9f575341072fda0546c4358 Mon Sep 17 00:00:00 2001 From: NRauschmayr Date: Tue, 6 Nov 2018 15:26:33 -0800 Subject: [PATCH 11/20] Updated index.md file --- docs/tutorials/gluon/info_gan.md.1 | 436 +++++++++++++++++++++++++++++ docs/tutorials/index.md | 2 +- 2 files changed, 437 insertions(+), 1 deletion(-) create mode 100644 docs/tutorials/gluon/info_gan.md.1 diff --git a/docs/tutorials/gluon/info_gan.md.1 b/docs/tutorials/gluon/info_gan.md.1 new file mode 100644 index 000000000000..5625d44f9135 --- /dev/null +++ b/docs/tutorials/gluon/info_gan.md.1 @@ -0,0 +1,436 @@ + +# Image similarity search with InfoGAN + +This notebook shows how to implement an InfoGAN based on Gluon. 
InfoGAN is an extension of GANs, where the generator input is split in 2 parts: random noise and a latent code c (see [InfoGAN Paper](https://arxiv.org/pdf/1606.03657.pdf)). +The codes are made meaningful by maximizing the mutual information between code and generator output. InfoGAN learns a disentangled representation in a completely unsupervised manner. It can be used for many applications such as image similarity search. This notebook uses the DCGAN example from the [Straight Dope Book](https://gluon.mxnet.io/chapter14_generative-adversarial-networks/dcgan.html) and extends it to create an InfoGAN. + + +```python +from __future__ import print_function +import sys +import os +import matplotlib as mpl +import tarfile +import matplotlib.image as mpimg +from matplotlib import pyplot as plt +from mxboard import SummaryWriter + +import mxnet as mx +from mxnet import gluon +from mxnet import ndarray as nd +from mxnet.gluon import nn, utils +from mxnet import autograd +import numpy as np + +from datetime import datetime +import time +import logging +``` + +The latent code vector c can contain several variables, which can be categorical and/or continuous. We set `n_continuous` to 2 and `n_categories` to 10. + + +```python +batch_size = 64 +z_dim = 100 +n_continuous = 2 +n_categories = 10 +ctx = mx.cpu() +``` + +Some functions to load and normalize images. + + +```python +lfw_url = 'http://vis-www.cs.umass.edu/lfw/lfw-deepfunneled.tgz' +data_path = 'lfw_dataset' +if not os.path.exists(data_path): + os.makedirs(data_path) + data_file = utils.download(lfw_url) + with tarfile.open(data_file) as tar: + tar.extractall(path=data_path) + + +``` + + +```python +def transform(data, width=64, height=64): + data = mx.image.imresize(data, width, height) + data = nd.transpose(data, (2,0,1)) + data = data.astype(np.float32)/127.5 - 1 + if data.shape[0] == 1: + data = nd.tile(data, (3, 1, 1)) + return data.reshape((1,) + data.shape) +``` + + +```python +def get_files(data_dir): + images = [] + filenames = [] + for path, _, fnames in os.walk(data_dir): + for fname in fnames: + if not fname.endswith('.jpg'): + continue + img = os.path.join(path, fname) + img_arr = mx.image.imread(img) + img_arr = transform(img_arr) + images.append(img_arr) + filenames.append(path + "/" + fname) + return images, filenames +``` + +Load the dataset `lfw_dataset` which contains images of celebrities. + + +```python +data_dir = 'lfw_dataset' +images, filenames = get_files(data_dir) +split = int(len(images)*0.8) +test_images = images[split:] +test_filenames = filenames[split:] +train_images = images[:split] +train_filenames = filenames[:split] + +train_data = mx.gluon.data.ArrayDataset(nd.concatenate(train_images)) +train_dataloader = mx.gluon.data.DataLoader(train_data, batch_size=batch_size, shuffle=True, last_batch='rollover', num_workers=4) +``` + +## Generator +Define the Generator model. Architecture is taken from the DCGAN implementation in [Straight Dope Book](https://gluon.mxnet.io/chapter14_generative-adversarial-networks/dcgan.html). The Generator consist of 4 layers where each layer involves a strided convolution, batch normalization, and rectified nonlinearity. It takes as input random noise and the latent code `c` and produces an `(64,64,3)` output image. 
+ + +```python +class Generator(gluon.HybridBlock): + def __init__(self, **kwargs): + super(Generator, self).__init__(**kwargs) + with self.name_scope(): + self.prev = nn.HybridSequential() + self.prev.add(nn.Dense(1024, use_bias=False), nn.BatchNorm(), nn.Activation(activation='relu')) + self.G = nn.HybridSequential() + + self.G.add(nn.Conv2DTranspose(64 * 8, 4, 1, 0, use_bias=False)) + self.G.add(nn.BatchNorm()) + self.G.add(nn.Activation('relu')) + self.G.add(nn.Conv2DTranspose(64 * 4, 4, 2, 1, use_bias=False)) + self.G.add(nn.BatchNorm()) + self.G.add(nn.Activation('relu')) + self.G.add(nn.Conv2DTranspose(64 * 2, 4, 2, 1, use_bias=False)) + self.G.add(nn.BatchNorm()) + self.G.add(nn.Activation('relu')) + self.G.add(nn.Conv2DTranspose(64, 4, 2, 1, use_bias=False)) + self.G.add(nn.BatchNorm()) + self.G.add(nn.Activation('relu')) + self.G.add(nn.Conv2DTranspose(3, 4, 2, 1, use_bias=False)) + self.G.add(nn.Activation('tanh')) + + def hybrid_forward(self, F, x): + x = self.prev(x) + x = F.reshape(x, (0, -1, 1, 1)) + return self.G(x) +``` + +## Discriminator +Define the Discriminator and Q model. The Q model shares many layers with the Discriminator. Its task is to estimate the code $c$ for a given fake image. It is used to maximize the lower bound to the mutual information. + + +```python +class Discriminator(gluon.HybridBlock): + def __init__(self, **kwargs): + super(Discriminator, self).__init__(**kwargs) + with self.name_scope(): + self.D = nn.HybridSequential() + self.D.add(nn.Conv2D(64, 4, 2, 1, use_bias=False)) + self.D.add(nn.LeakyReLU(0.2)) + self.D.add(nn.Conv2D(64 * 2, 4, 2, 1, use_bias=False)) + self.D.add(nn.BatchNorm()) + self.D.add(nn.LeakyReLU(0.2)) + self.D.add(nn.Conv2D(64 * 4, 4, 2, 1, use_bias=False)) + self.D.add(nn.BatchNorm()) + self.D.add(nn.LeakyReLU(0.2)) + self.D.add(nn.Conv2D(64 * 8, 4, 2, 1, use_bias=False)) + self.D.add(nn.BatchNorm()) + self.D.add(nn.LeakyReLU(0.2)) + + self.D.add(nn.Dense(1024, use_bias=False), nn.BatchNorm(), nn.Activation(activation='relu')) + + self.prob = nn.Dense(1)#, activation='sigmoid') + self.feat = nn.HybridSequential() + self.feat.add(nn.Dense(128, use_bias=False), nn.BatchNorm(), nn.Activation(activation='relu')) + self.category_prob = nn.Dense(n_categories) + self.continuous_mean = nn.Dense(n_continuous) + self.Q = nn.HybridSequential() + self.Q.add(self.feat, self.category_prob, self.continuous_mean) + + def hybrid_forward(self, F, x): + x = self.D(x) + prob = self.prob(x) + feat = self.feat(x) + category_prob = self.category_prob(feat) + continuous_mean = self.continuous_mean(feat) + + return prob, category_prob, continuous_mean +``` + +The InfoGAN has the following layout. + +Discriminator and Generator are the same as in the DCGAN example. On top of the Disciminator is the Q model, which is estimating the code `c` for given fake images. The Generator's input is random noise and the latent code `c`. + +## Training Loop +Initialize Generator and Discriminator and define correspoing trainer function. 
+ + +```python +generator = Generator() +generator.hybridize() +generator.initialize(mx.init.Normal(0.002), ctx=ctx) + +discriminator = Discriminator() +discriminator.hybridize() +discriminator.initialize(mx.init.Normal(0.002), ctx=ctx) + +lr = 0.0001 +beta = 0.5 + +g_trainer = gluon.Trainer(generator.collect_params(), 'adam', {'learning_rate': lr, 'beta1': beta}) +d_trainer = gluon.Trainer(discriminator.collect_params(), 'adam', {'learning_rate': lr, 'beta1': beta}) +q_trainer = gluon.Trainer(discriminator.Q.collect_params(), 'adam', {'learning_rate': lr, 'beta1': beta}) +``` + +Create vectors with real (=1) and fake labels (=0). + + +```python +real_label = nd.ones((batch_size,), ctx=ctx) +fake_label = nd.zeros((batch_size,),ctx=ctx) +``` + +Load a pertrained model. + + +```python +if os.path.isfile("infogan_d_latest.params") and os.path.isfile("infogan_g_latest.params"): + discriminator.load_parameters('infogan_d_latest.params', ctx=ctx, allow_missing=True, ignore_extra=True) + generator.load_parameters('infogan_g_latest.params', ctx=ctx, allow_missing=True, ignore_extra=True) +``` + +The latent code $c$ is part of the Generator input and it contains mutliple variables (continuous, categorical) that can represent different distributions. In order to make sure that the Generator uses the latent code, mutual information is introduced into the GAN loss term. Mutual information measures how much X is known given Y or vice versa. It is defined as: + +$$I(X;Y) = entropy(X) - entropy(X|Y) = entropy(Y) - entropy(Y|X) $$ + +The InfoGAN loss is: +$$\min_{G} \max_{D} \, V(D, G) - \lambda I(c, G(z, c))$$ + +where $V(D,G)$ is the GAN loss and the mutual information $I(c, G(z, c))$ goes in as regularization. The goal is to reach high mutual information, in order to learn meaningful codes $c$ for the data. + + +Define the loss functions. `SoftmaxCrossEntropyLoss` for the categorical code `c`, `L2Loss` for the continious code `c` and `SigmoidBinaryCrossEntropyLoss` for the normal GAN loss. + + +```python +loss1 = gluon.loss.SigmoidBinaryCrossEntropyLoss() +loss2 = gluon.loss.L2Loss() +loss3 = gluon.loss.SoftmaxCrossEntropyLoss() +``` + +This function samples `c`, `z`, and concatenates them to create the generator input. + + +```python +def create_generator_input(): + + #create random noise + z = mx.nd.random_normal(0, 1, shape=(batch_size, z_dim), ctx=ctx) + label = nd.array(np.random.randint(n_categories, size=batch_size)).as_in_context(ctx) + c1 = nd.one_hot(label, depth=n_categories).as_in_context(ctx) + c2 = nd.random.uniform(-1, 1, shape=(batch_size, n_continuous)).as_in_context(ctx) + + # concatenate random noise with c which will be the input of the generator + return mx.nd.concat(z, c1, c2, dim=1) +``` + +Define the training loop. +1. The discriminator receives `real_data` and `loss1` measures how many real images have been identified as real +2. The discriminator receives `fake_image` from the Generator and `loss1` measures how many fake images have been identified as fake +3. Update Discriminator +4. The updated discriminator receives `fake_image` and `loss1` measures how many fake images have been been identified as real, `loss2` measures the difference between the sampled continuous latent code `c` and the output of the Q model and `loss3` measures the difference between the sampled categorical latent code `c` and the output of the Q model. +4. 
Update Generator and Q + + +```python +with SummaryWriter(logdir='./logs/') as sw: + + epochs = 1 + i = 0 + for epoch in range(epochs): + print("Epoch", epoch) + starttime = time.time() + + d_error_epoch = mx.nd.zeros((1,), ctx=ctx) + g_error_epoch = mx.nd.zeros((1,), ctx=ctx) + + for idx, data in enumerate(train_dataloader): + i = i + 1 + + #get real data and generator input + real_data = data.as_in_context(ctx) + g_input = create_generator_input() + + + #Update discriminator: Input real data and fake data + with autograd.record(): + output_real,_,_ = discriminator(real_data) + d_error_real = loss1(output_real, real_label) + + # create fake image and input it to discriminator + fake_image = generator(g_input) + output_fake,_,_ = discriminator(fake_image.detach()) + d_error_fake = loss1(output_fake, fake_label) + + # total discriminator error + d_error = d_error_real + d_error_fake + + d_error_epoch += d_error.mean() + if i % 2 == 0: + d_error.backward() + d_trainer.step(data.shape[0]) + + #Update generator: Input random noise and latent code vector + with autograd.record(): + fake_image = generator(g_input) + output_fake, category_prob, continuous_mean = discriminator(fake_image) + g_error = loss1(output_fake, real_label) + loss3(category_prob, label) + loss2(c2, continuous_mean) + + g_error.backward() + g_error_epoch += g_error.mean() + + g_trainer.step(data.shape[0]) + q_trainer.step(data.shape[0]) + + # logging + if idx % 10 == 0: + + logging.info('speed: {} samples/s'.format(batch_size / (time.time() - starttime))) + logging.info('discriminator loss = %f, generator loss = %f at iter %d epoch %d' + %(d_error_epoch.asscalar()/idx,g_error_epoch.asscalar()/idx, idx, epoch)) + + g_input = create_generator_input() + + # create some fake image for logging in MXBoard + fake_image = generator(g_input) + + sw.add_scalar(tag='Loss_D', value={'test':d_error_epoch.asscalar()/idx}, global_step=i) + sw.add_scalar(tag='Loss_G', value={'test':d_error_epoch.asscalar()/idx}, global_step=i) + sw.add_image(tag='data_image', image=((fake_image[0]+ 1.0) * 127.5).astype(np.uint8) , global_step=i) + sw.flush() + + time1 = time.time() + + #discriminator.save_parameters("infogan_d.params") + #generator.save_parameters("infogan_g.params") +``` + +## Image similarity +Once the InfoGAN is trained, we can use the Discriminator to do an image similarity search. The idea is that the network learned meaningful features from the images based on the mutual information e.g. pose of people in an image. + +Load the trained discriminator and retrieve one of its last layers. + + +```python +discriminator = Discriminator() +discriminator.load_parameters("infogan_d_latest.params", ctx=ctx, ignore_extra=True) + +discriminator = discriminator.D[:11] +print (discriminator) + +discriminator.hybridize() +``` + +Nearest neighbor function, which takes a matrix of features and an input feature vector. It returns the 3 closest features. + + +```python +def get_knn(features, input_vector, k=3): + dist = (nd.square(features - input_vector).sum(axis=1))/features.shape[0] + print (np.sort(dist.asnumpy())[:10]) + indices = dist.asnumpy().argsort()[:k] + return [(index, dist[index].asscalar()) for index in indices] +``` + +A helper function to visualize image data. 
+ + +```python +def visualize(img_array): + plt.imshow(((img_array.asnumpy().transpose(1, 2, 0) + 1.0) * 127.5).astype(np.uint8)) + plt.axis('off') +``` + +Take some images from the test data, obtain its feature vector from `discriminator.D[:11]` and plot images of the corresponding closest vectors in the feature space. + + +```python +feature_size = 8192 + +features = mx.nd.zeros((len(test_images), feature_size), ctx=ctx) + +for idx, image in enumerate(test_images): + + feature = discriminator(mx.nd.array(image)) + feature = feature.reshape(feature_size,) + features[idx,:] = feature.copyto(ctx) + + +for image in test_images[:100]: + + feature = discriminator(mx.nd.array(image)) + feature = feature.reshape((feature_size,)) + image = image.reshape((3,64,64)) + + + indices = get_knn(features, feature, k=10) + fig = plt.figure(figsize=(15,12)) + plt.subplot(1,10,1) + + visualize(image) + for i in range(2,9): + if indices[i-1][1] < 1.5: + plt.subplot(1,10,i) + sim = test_images[indices[i-1][0]].reshape(3,64,64) + visualize(sim) + plt.show() + plt.clf() +``` +![png](https://raw.githubusercontent.com/NRauschmayr/web-data/master/mxnet/doc/tutorials/info_gan/output.png) + +## How the Generator learns +We trained the Generator for a couple of epochs and stored a couple of fake images per epoch. Check the video. + ![alt text](https://raw.githubusercontent.com/NRauschmayr/web-data/master/mxnet/doc/tutorials/info_gan/infogan.gif) + + +The following function computes the TSNE on the feature matrix and stores the result in a json-file. This file can be loaded with [TSNEViewer](https://ml4a.github.io/guides/ImageTSNEViewer/) + + +```python +from sklearn.manifold import TSNE +from scipy.spatial import distance +import os +import json + +tsne = TSNE(n_components=2, learning_rate=150, perplexity=30, verbose=2).fit_transform(features.asnumpy()) +# save data to json +data = [] +counter = 0 +for i,f in enumerate(test_filenames): + + point = [float((tsne[i,k] - np.min(tsne[:,k]))/(np.max(tsne[:,k]) - np.min(tsne[:,k]))) for k in range(2) ] + data.append({"path": os.path.abspath(os.path.join(os.getcwd(),f)), "point": point}) + +with open("imagetsne.json", 'w') as outfile: + json.dump(data, outfile) +``` + +Load the file with TSNEViewer. You can now inspect whether similiar looking images are grouped nearby or not. 
+ + diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md index e4e74e383603..49dde321798e 100644 --- a/docs/tutorials/index.md +++ b/docs/tutorials/index.md @@ -57,7 +57,7 @@ Select API:  * [Logistic Regression](/tutorials/gluon/logistic_regression_explained.html) * [Word-level text generation with RNN, LSTM and GRU](http://gluon.mxnet.io/chapter05_recurrent-neural-networks/rnns-gluon.html) External link * [Visual Question Answering](http://gluon.mxnet.io/chapter08_computer-vision/visual-question-answer.html) External link - * [Image similiarity search with InfoGAN](/docs/tutorials/gluon/info_gan.md)External link + * [Image similiarity search with InfoGAN](/tutorials/gluon/info_gan.html) * Practitioner Guides * [Gotchas using NumPy](/tutorials/gluon/gotchas_numpy_in_mxnet.html) * [Multi-GPU training](http://gluon.mxnet.io/chapter07_distributed-learning/multiple-gpus-gluon.html) External link From 138e00d02fa0162e48ec3ad93e9a077a30a95c05 Mon Sep 17 00:00:00 2001 From: NRauschmayr Date: Tue, 6 Nov 2018 15:30:29 -0800 Subject: [PATCH 12/20] Updated index.md file --- docs/tutorials/gluon/info_gan.md.1 | 436 ----------------------------- 1 file changed, 436 deletions(-) delete mode 100644 docs/tutorials/gluon/info_gan.md.1 diff --git a/docs/tutorials/gluon/info_gan.md.1 b/docs/tutorials/gluon/info_gan.md.1 deleted file mode 100644 index 5625d44f9135..000000000000 --- a/docs/tutorials/gluon/info_gan.md.1 +++ /dev/null @@ -1,436 +0,0 @@ - -# Image similarity search with InfoGAN - -This notebook shows how to implement an InfoGAN based on Gluon. InfoGAN is an extension of GANs, where the generator input is split in 2 parts: random noise and a latent code c (see [InfoGAN Paper](https://arxiv.org/pdf/1606.03657.pdf)). -The codes are made meaningful by maximizing the mutual information between code and generator output. InfoGAN learns a disentangled representation in a completely unsupervised manner. It can be used for many applications such as image similarity search. This notebook uses the DCGAN example from the [Straight Dope Book](https://gluon.mxnet.io/chapter14_generative-adversarial-networks/dcgan.html) and extends it to create an InfoGAN. - - -```python -from __future__ import print_function -import sys -import os -import matplotlib as mpl -import tarfile -import matplotlib.image as mpimg -from matplotlib import pyplot as plt -from mxboard import SummaryWriter - -import mxnet as mx -from mxnet import gluon -from mxnet import ndarray as nd -from mxnet.gluon import nn, utils -from mxnet import autograd -import numpy as np - -from datetime import datetime -import time -import logging -``` - -The latent code vector c can contain several variables, which can be categorical and/or continuous. We set `n_continuous` to 2 and `n_categories` to 10. - - -```python -batch_size = 64 -z_dim = 100 -n_continuous = 2 -n_categories = 10 -ctx = mx.cpu() -``` - -Some functions to load and normalize images. 
- - -```python -lfw_url = 'http://vis-www.cs.umass.edu/lfw/lfw-deepfunneled.tgz' -data_path = 'lfw_dataset' -if not os.path.exists(data_path): - os.makedirs(data_path) - data_file = utils.download(lfw_url) - with tarfile.open(data_file) as tar: - tar.extractall(path=data_path) - - -``` - - -```python -def transform(data, width=64, height=64): - data = mx.image.imresize(data, width, height) - data = nd.transpose(data, (2,0,1)) - data = data.astype(np.float32)/127.5 - 1 - if data.shape[0] == 1: - data = nd.tile(data, (3, 1, 1)) - return data.reshape((1,) + data.shape) -``` - - -```python -def get_files(data_dir): - images = [] - filenames = [] - for path, _, fnames in os.walk(data_dir): - for fname in fnames: - if not fname.endswith('.jpg'): - continue - img = os.path.join(path, fname) - img_arr = mx.image.imread(img) - img_arr = transform(img_arr) - images.append(img_arr) - filenames.append(path + "/" + fname) - return images, filenames -``` - -Load the dataset `lfw_dataset` which contains images of celebrities. - - -```python -data_dir = 'lfw_dataset' -images, filenames = get_files(data_dir) -split = int(len(images)*0.8) -test_images = images[split:] -test_filenames = filenames[split:] -train_images = images[:split] -train_filenames = filenames[:split] - -train_data = mx.gluon.data.ArrayDataset(nd.concatenate(train_images)) -train_dataloader = mx.gluon.data.DataLoader(train_data, batch_size=batch_size, shuffle=True, last_batch='rollover', num_workers=4) -``` - -## Generator -Define the Generator model. Architecture is taken from the DCGAN implementation in [Straight Dope Book](https://gluon.mxnet.io/chapter14_generative-adversarial-networks/dcgan.html). The Generator consist of 4 layers where each layer involves a strided convolution, batch normalization, and rectified nonlinearity. It takes as input random noise and the latent code `c` and produces an `(64,64,3)` output image. - - -```python -class Generator(gluon.HybridBlock): - def __init__(self, **kwargs): - super(Generator, self).__init__(**kwargs) - with self.name_scope(): - self.prev = nn.HybridSequential() - self.prev.add(nn.Dense(1024, use_bias=False), nn.BatchNorm(), nn.Activation(activation='relu')) - self.G = nn.HybridSequential() - - self.G.add(nn.Conv2DTranspose(64 * 8, 4, 1, 0, use_bias=False)) - self.G.add(nn.BatchNorm()) - self.G.add(nn.Activation('relu')) - self.G.add(nn.Conv2DTranspose(64 * 4, 4, 2, 1, use_bias=False)) - self.G.add(nn.BatchNorm()) - self.G.add(nn.Activation('relu')) - self.G.add(nn.Conv2DTranspose(64 * 2, 4, 2, 1, use_bias=False)) - self.G.add(nn.BatchNorm()) - self.G.add(nn.Activation('relu')) - self.G.add(nn.Conv2DTranspose(64, 4, 2, 1, use_bias=False)) - self.G.add(nn.BatchNorm()) - self.G.add(nn.Activation('relu')) - self.G.add(nn.Conv2DTranspose(3, 4, 2, 1, use_bias=False)) - self.G.add(nn.Activation('tanh')) - - def hybrid_forward(self, F, x): - x = self.prev(x) - x = F.reshape(x, (0, -1, 1, 1)) - return self.G(x) -``` - -## Discriminator -Define the Discriminator and Q model. The Q model shares many layers with the Discriminator. Its task is to estimate the code $c$ for a given fake image. It is used to maximize the lower bound to the mutual information. 
- - -```python -class Discriminator(gluon.HybridBlock): - def __init__(self, **kwargs): - super(Discriminator, self).__init__(**kwargs) - with self.name_scope(): - self.D = nn.HybridSequential() - self.D.add(nn.Conv2D(64, 4, 2, 1, use_bias=False)) - self.D.add(nn.LeakyReLU(0.2)) - self.D.add(nn.Conv2D(64 * 2, 4, 2, 1, use_bias=False)) - self.D.add(nn.BatchNorm()) - self.D.add(nn.LeakyReLU(0.2)) - self.D.add(nn.Conv2D(64 * 4, 4, 2, 1, use_bias=False)) - self.D.add(nn.BatchNorm()) - self.D.add(nn.LeakyReLU(0.2)) - self.D.add(nn.Conv2D(64 * 8, 4, 2, 1, use_bias=False)) - self.D.add(nn.BatchNorm()) - self.D.add(nn.LeakyReLU(0.2)) - - self.D.add(nn.Dense(1024, use_bias=False), nn.BatchNorm(), nn.Activation(activation='relu')) - - self.prob = nn.Dense(1)#, activation='sigmoid') - self.feat = nn.HybridSequential() - self.feat.add(nn.Dense(128, use_bias=False), nn.BatchNorm(), nn.Activation(activation='relu')) - self.category_prob = nn.Dense(n_categories) - self.continuous_mean = nn.Dense(n_continuous) - self.Q = nn.HybridSequential() - self.Q.add(self.feat, self.category_prob, self.continuous_mean) - - def hybrid_forward(self, F, x): - x = self.D(x) - prob = self.prob(x) - feat = self.feat(x) - category_prob = self.category_prob(feat) - continuous_mean = self.continuous_mean(feat) - - return prob, category_prob, continuous_mean -``` - -The InfoGAN has the following layout. - -Discriminator and Generator are the same as in the DCGAN example. On top of the Disciminator is the Q model, which is estimating the code `c` for given fake images. The Generator's input is random noise and the latent code `c`. - -## Training Loop -Initialize Generator and Discriminator and define correspoing trainer function. - - -```python -generator = Generator() -generator.hybridize() -generator.initialize(mx.init.Normal(0.002), ctx=ctx) - -discriminator = Discriminator() -discriminator.hybridize() -discriminator.initialize(mx.init.Normal(0.002), ctx=ctx) - -lr = 0.0001 -beta = 0.5 - -g_trainer = gluon.Trainer(generator.collect_params(), 'adam', {'learning_rate': lr, 'beta1': beta}) -d_trainer = gluon.Trainer(discriminator.collect_params(), 'adam', {'learning_rate': lr, 'beta1': beta}) -q_trainer = gluon.Trainer(discriminator.Q.collect_params(), 'adam', {'learning_rate': lr, 'beta1': beta}) -``` - -Create vectors with real (=1) and fake labels (=0). - - -```python -real_label = nd.ones((batch_size,), ctx=ctx) -fake_label = nd.zeros((batch_size,),ctx=ctx) -``` - -Load a pertrained model. - - -```python -if os.path.isfile("infogan_d_latest.params") and os.path.isfile("infogan_g_latest.params"): - discriminator.load_parameters('infogan_d_latest.params', ctx=ctx, allow_missing=True, ignore_extra=True) - generator.load_parameters('infogan_g_latest.params', ctx=ctx, allow_missing=True, ignore_extra=True) -``` - -The latent code $c$ is part of the Generator input and it contains mutliple variables (continuous, categorical) that can represent different distributions. In order to make sure that the Generator uses the latent code, mutual information is introduced into the GAN loss term. Mutual information measures how much X is known given Y or vice versa. It is defined as: - -$$I(X;Y) = entropy(X) - entropy(X|Y) = entropy(Y) - entropy(Y|X) $$ - -The InfoGAN loss is: -$$\min_{G} \max_{D} \, V(D, G) - \lambda I(c, G(z, c))$$ - -where $V(D,G)$ is the GAN loss and the mutual information $I(c, G(z, c))$ goes in as regularization. The goal is to reach high mutual information, in order to learn meaningful codes $c$ for the data. 
Define the loss functions: `SoftmaxCrossEntropyLoss` for the categorical code, `L2Loss` for the continuous code and `SigmoidBinaryCrossEntropyLoss` for the standard GAN loss.

```python
loss1 = gluon.loss.SigmoidBinaryCrossEntropyLoss() # standard GAN loss (real vs. fake)
loss2 = gluon.loss.L2Loss()                        # reconstruction of the continuous code
loss3 = gluon.loss.SoftmaxCrossEntropyLoss()       # reconstruction of the categorical code
```

This function samples the categorical code `c1`, the continuous code `c2` and the random noise `z`, and concatenates them to create the Generator input. It also returns the sampled codes, which are needed later as targets for the Q model.

```python
def create_generator_input():

    # create random noise
    z = mx.nd.random_normal(0, 1, shape=(batch_size, z_dim), ctx=ctx)
    # sample a categorical code and one-hot encode it
    label = nd.array(np.random.randint(n_categories, size=batch_size)).as_in_context(ctx)
    c1 = nd.one_hot(label, depth=n_categories).as_in_context(ctx)
    # sample the continuous code uniformly from [-1, 1]
    c2 = nd.random.uniform(-1, 1, shape=(batch_size, n_continuous)).as_in_context(ctx)

    # concatenate random noise with the latent code, which will be the input of the generator
    return mx.nd.concat(z, c1, c2, dim=1), label, c2
```

Define the training loop:
1. The Discriminator receives `real_data`, and `loss1` measures how many real images have been identified as real.
2. The Discriminator receives `fake_image` from the Generator, and `loss1` measures how many fake images have been identified as fake.
3. Update the Discriminator (in this example only every second iteration).
4. The updated Discriminator receives `fake_image` again; now `loss1` measures how many fake images have been identified as real, `loss2` measures the difference between the sampled continuous latent code and the output of the Q model, and `loss3` measures the difference between the sampled categorical latent code and the output of the Q model.
5. Update the Generator and Q.

```python
with SummaryWriter(logdir='./logs/') as sw:

    epochs = 1
    counter = 0
    for epoch in range(epochs):
        print("Epoch", epoch)
        starttime = time.time()

        d_error_epoch = mx.nd.zeros((1,), ctx=ctx)
        g_error_epoch = mx.nd.zeros((1,), ctx=ctx)

        for idx, data in enumerate(train_dataloader):
            counter = counter + 1

            # get real data and generator input
            real_data = data.as_in_context(ctx)
            g_input, label, c2 = create_generator_input()

            # Update discriminator: input real data and fake data
            with autograd.record():
                output_real,_,_ = discriminator(real_data)
                d_error_real = loss1(output_real, real_label)

                # create fake images and input them to the discriminator
                fake_image = generator(g_input)
                output_fake,_,_ = discriminator(fake_image.detach())
                d_error_fake = loss1(output_fake, fake_label)

                # total discriminator error
                d_error = d_error_real + d_error_fake

            d_error_epoch += d_error.mean()

            # update the discriminator only every second iteration
            if counter % 2 == 0:
                d_error.backward()
                d_trainer.step(batch_size)

            # Update generator and Q: input random noise and the latent code vector
            with autograd.record():
                fake_image = generator(g_input)
                output_fake, category_prob, continuous_mean = discriminator(fake_image)
                g_error = loss1(output_fake, real_label) + loss3(category_prob, label) + loss2(c2, continuous_mean)

            g_error.backward()
            g_error_epoch += g_error.mean()

            g_trainer.step(batch_size)
            q_trainer.step(batch_size)

            # logging
            if idx % 10 == 0 and idx > 0:

                logging.info('speed: {} samples/s'.format(batch_size * idx / (time.time() - starttime)))
                logging.info('discriminator loss = %f, generator loss = %f at iter %d epoch %d'
                             %(d_error_epoch.asscalar()/idx, g_error_epoch.asscalar()/idx, idx, epoch))

                g_input,_,_ = create_generator_input()

                # create some fake images for logging in MXBoard
                fake_image = generator(g_input)

                sw.add_scalar(tag='Loss_D', value={'test':d_error_epoch.asscalar()/idx}, global_step=counter)
                sw.add_scalar(tag='Loss_G', value={'test':g_error_epoch.asscalar()/idx}, global_step=counter)
                sw.add_image(tag='data_image', image=((fake_image[0] + 1.0) * 127.5).astype(np.uint8), global_step=counter)
                sw.flush()

    discriminator.save_parameters("infogan_d_latest.params")
    generator.save_parameters("infogan_g_latest.params")
```
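After training, a quick qualitative check can reveal whether the codes have become meaningful: fix the noise `z`, hold the continuous code constant, and sweep the categorical code one category per column. If the codes carry information, each column should show a consistent factor of variation. The following is a minimal sketch of such a check, assuming the `generator` above has been trained or loaded; it is illustrative and not part of the original tutorial:

```python
# keep z fixed and vary only the categorical part of the latent code
z = nd.tile(mx.nd.random_normal(0, 1, shape=(1, z_dim), ctx=ctx), (n_categories, 1))
c1 = nd.one_hot(nd.arange(n_categories, ctx=ctx), depth=n_categories)  # one row per category
c2 = nd.zeros((n_categories, n_continuous), ctx=ctx)                   # continuous code held constant

images = generator(nd.concat(z, c1, c2, dim=1))

plt.figure(figsize=(15, 2))
for col in range(n_categories):
    plt.subplot(1, n_categories, col + 1)
    # map the tanh output from [-1, 1] back to [0, 255]
    plt.imshow(((images[col].asnumpy().transpose(1, 2, 0) + 1.0) * 127.5).astype(np.uint8))
    plt.axis('off')
plt.show()
```

The same sweep can be repeated for each dimension of `c2` to inspect what the continuous codes capture.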
## Image similarity
Once the InfoGAN is trained, we can use the Discriminator for an image similarity search. The idea is that, driven by the mutual information term, the network has learned meaningful features from the images, for example the pose of the people in an image.

Load the trained discriminator and retrieve one of its last layers.

```python
discriminator = Discriminator()
discriminator.load_parameters("infogan_d_latest.params", ctx=ctx, ignore_extra=True)

# keep only the convolutional feature extractor (first 11 layers of D)
discriminator = discriminator.D[:11]
print(discriminator)

discriminator.hybridize()
```

Nearest-neighbor function: it takes a matrix of features and an input feature vector, and returns the `k` closest features (by default 3).

```python
def get_knn(features, input_vector, k=3):
    # squared Euclidean distance, scaled by the number of stored feature vectors
    dist = (nd.square(features - input_vector).sum(axis=1))/features.shape[0]
    indices = dist.asnumpy().argsort()[:k]
    return [(index, dist[index].asscalar()) for index in indices]
```

A helper function to visualize image data.

```python
def visualize(img_array):
    plt.imshow(((img_array.asnumpy().transpose(1, 2, 0) + 1.0) * 127.5).astype(np.uint8))
    plt.axis('off')
```

Take some images from the test data, obtain their feature vectors from `discriminator.D[:11]`, and plot the images whose vectors are closest in feature space.

```python
# 512 channels * 4 * 4 spatial positions produced by discriminator.D[:11]
feature_size = 8192

features = mx.nd.zeros((len(test_images), feature_size), ctx=ctx)

for idx, image in enumerate(test_images):
    feature = discriminator(mx.nd.array(image, ctx=ctx))
    feature = feature.reshape((feature_size,))
    features[idx,:] = feature.copyto(ctx)


for image in test_images[:100]:

    feature = discriminator(mx.nd.array(image, ctx=ctx))
    feature = feature.reshape((feature_size,))
    image = image.reshape((3,64,64))

    indices = get_knn(features, feature, k=10)
    fig = plt.figure(figsize=(15,12))
    plt.subplot(1,10,1)

    visualize(image)
    for i in range(2,9):
        # only plot neighbors whose distance is below a threshold
        if indices[i-1][1] < 1.5:
            plt.subplot(1,10,i)
            sim = test_images[indices[i-1][0]].reshape((3,64,64))
            visualize(sim)
    plt.show()
    plt.clf()
```
![png](https://raw.githubusercontent.com/NRauschmayr/web-data/master/mxnet/doc/tutorials/info_gan/output.png)

## How the Generator learns
We trained the Generator for several epochs and saved a few generated images after each epoch. The animation below shows how the Generator's output improves over time.

![alt text](https://raw.githubusercontent.com/NRauschmayr/web-data/master/mxnet/doc/tutorials/info_gan/infogan.gif)
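The animation is assembled from snapshots saved during training. The exact script is not part of this tutorial, but a helper along the following lines can produce them; treat it as a sketch, and note that `fixed_input` is assumed to be created once before training, e.g. `fixed_input, _, _ = create_generator_input()`:

```python
# save a 4x4 grid of generated images for a fixed Generator input
def save_samples(generator, fixed_input, epoch, out_dir='snapshots'):
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    images = generator(fixed_input)
    fig = plt.figure(figsize=(8, 8))
    for n in range(min(16, images.shape[0])):
        plt.subplot(4, 4, n + 1)
        visualize(images[n])
    plt.savefig(os.path.join(out_dir, 'epoch_%03d.png' % epoch))
    plt.close(fig)
```

Calling `save_samples(generator, fixed_input, epoch)` at the end of each epoch yields a series of PNG files that can be stitched into a GIF. Using the same `fixed_input` every time makes the epochs directly comparable.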
The following function computes a t-SNE embedding of the feature matrix and stores the result in a JSON file. This file can be loaded with the [TSNEViewer](https://ml4a.github.io/guides/ImageTSNEViewer/).

```python
from sklearn.manifold import TSNE
import json

tsne = TSNE(n_components=2, learning_rate=150, perplexity=30, verbose=2).fit_transform(features.asnumpy())

# normalize the embedding to [0, 1] and save it together with the image paths
data = []
for i, f in enumerate(test_filenames):
    point = [float((tsne[i,k] - np.min(tsne[:,k]))/(np.max(tsne[:,k]) - np.min(tsne[:,k]))) for k in range(2)]
    data.append({"path": os.path.abspath(os.path.join(os.getcwd(), f)), "point": point})

with open("imagetsne.json", 'w') as outfile:
    json.dump(data, outfile)
```

Load the file with the TSNEViewer. You can now inspect whether similar-looking images are grouped nearby.


From f4942c8c1cf40b89d46e10eaeb990538b3aac0e3 Mon Sep 17 00:00:00 2001
From: NRauschmayr
Date: Tue, 6 Nov 2018 17:31:03 -0800
Subject: [PATCH 13/20] change links

---
 docs/tutorials/gluon/info_gan.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/docs/tutorials/gluon/info_gan.md b/docs/tutorials/gluon/info_gan.md
index 31e13b43b85d..a871bb9eeeb0 100644
--- a/docs/tutorials/gluon/info_gan.md
+++ b/docs/tutorials/gluon/info_gan.md
@@ -171,7 +171,7 @@ class Discriminator(gluon.HybridBlock):
 ```

 The InfoGAN has the following layout.
-
+
 Discriminator and Generator are the same as in the DCGAN example. On top of the Disciminator is the Q model, which is estimating the code `c` for given fake images. The Generator's input is random noise and the latent code `c`.

 ## Training Loop
@@ -214,11 +214,11 @@

 The latent code $c$ is part of the Generator input and it contains mutliple variables (continuous, categorical) that can represent different distributions. In order to make sure that the Generator uses the latent code, mutual information is introduced into the GAN loss term. Mutual information measures how much X is known given Y or vice versa. It is defined as:

-![gif](https://raw.githubusercontent.com/NRauschmayr/web-data/master/mxnet/doc/tutorials/info_gan/entropy.gif)
+![gif](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/doc/tutorials/info_gan/entropy.gif)

 The InfoGAN loss is:

-![gif](https://raw.githubusercontent.com/NRauschmayr/web-data/master/mxnet/doc/tutorials/info_gan/loss.gif)
+![gif](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/doc/tutorials/info_gan/loss.gif)

 where `V(D,G)` is the GAN loss and the mutual information `I(c, G(z, c))` goes in as regularization. The goal is to reach high mutual information, in order to learn meaningful codes `c` for the data.
@@ -401,11 +401,11 @@ for image in test_images[:100]:
     plt.show()
     plt.clf()
 ```
-![png](https://raw.githubusercontent.com/NRauschmayr/web-data/master/mxnet/doc/tutorials/info_gan/output.png)
+![png](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/doc/tutorials/info_gan/output.png)

 ## How the Generator learns
 We trained the Generator for a couple of epochs and stored a couple of fake images per epoch. Check the video.
- ![alt text](https://raw.githubusercontent.com/NRauschmayr/web-data/master/mxnet/doc/tutorials/info_gan/infogan.gif)
+ ![alt text](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/doc/tutorials/info_gan/infogan.gif)


 The following function computes the TSNE on the feature matrix and stores the result in a json-file.
This file can be loaded with [TSNEViewer](https://ml4a.github.io/guides/ImageTSNEViewer/) From ec4cd9b20a5c49e0767deaa03e58ef9304cbff2f Mon Sep 17 00:00:00 2001 From: NRauschmayr Date: Wed, 7 Nov 2018 09:03:05 -0800 Subject: [PATCH 14/20] Fixed typo --- docs/tutorials/gluon/Untitled.ipynb | 6 ++++++ docs/tutorials/gluon/info_gan.md | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) create mode 100644 docs/tutorials/gluon/Untitled.ipynb diff --git a/docs/tutorials/gluon/Untitled.ipynb b/docs/tutorials/gluon/Untitled.ipynb new file mode 100644 index 000000000000..2fd64429bf42 --- /dev/null +++ b/docs/tutorials/gluon/Untitled.ipynb @@ -0,0 +1,6 @@ +{ + "cells": [], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/tutorials/gluon/info_gan.md b/docs/tutorials/gluon/info_gan.md index a871bb9eeeb0..7ab49357cac2 100644 --- a/docs/tutorials/gluon/info_gan.md +++ b/docs/tutorials/gluon/info_gan.md @@ -212,7 +212,7 @@ if os.path.isfile("infogan_d_latest.params") and os.path.isfile("infogan_g_lates generator.load_parameters('infogan_g_latest.params', ctx=ctx, allow_missing=True, ignore_extra=True) ``` -The latent code $c$ is part of the Generator input and it contains mutliple variables (continuous, categorical) that can represent different distributions. In order to make sure that the Generator uses the latent code, mutual information is introduced into the GAN loss term. Mutual information measures how much X is known given Y or vice versa. It is defined as: +The latent code `c` is part of the Generator input and it contains mutliple variables (continuous, categorical) that can represent different distributions. In order to make sure that the Generator uses the latent code, mutual information is introduced into the GAN loss term. Mutual information measures how much X is known given Y or vice versa. It is defined as: ![gif](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/doc/tutorials/info_gan/entropy.gif) From 30579f6544709c1a6c08ddc4bc8510ff359229ca Mon Sep 17 00:00:00 2001 From: Nathalie Rauschmayr Date: Wed, 7 Nov 2018 10:41:47 -0800 Subject: [PATCH 15/20] Delete Untitled.ipynb --- docs/tutorials/gluon/Untitled.ipynb | 6 ------ 1 file changed, 6 deletions(-) delete mode 100644 docs/tutorials/gluon/Untitled.ipynb diff --git a/docs/tutorials/gluon/Untitled.ipynb b/docs/tutorials/gluon/Untitled.ipynb deleted file mode 100644 index 2fd64429bf42..000000000000 --- a/docs/tutorials/gluon/Untitled.ipynb +++ /dev/null @@ -1,6 +0,0 @@ -{ - "cells": [], - "metadata": {}, - "nbformat": 4, - "nbformat_minor": 2 -} From 09b5af58b1603659315958e54fc142a598755d01 Mon Sep 17 00:00:00 2001 From: Nathalie Rauschmayr Date: Wed, 7 Nov 2018 13:49:18 -0800 Subject: [PATCH 16/20] Adding Vishaals comments --- docs/tutorials/gluon/info_gan.md | 64 ++++++++++++++++---------------- 1 file changed, 31 insertions(+), 33 deletions(-) diff --git a/docs/tutorials/gluon/info_gan.md b/docs/tutorials/gluon/info_gan.md index 7ab49357cac2..784d5faa7aeb 100644 --- a/docs/tutorials/gluon/info_gan.md +++ b/docs/tutorials/gluon/info_gan.md @@ -1,31 +1,32 @@ # Image similarity search with InfoGAN -This notebook shows how to implement an InfoGAN based on Gluon. InfoGAN is an extension of GANs, where the generator input is split in 2 parts: random noise and a latent code c (see [InfoGAN Paper](https://arxiv.org/pdf/1606.03657.pdf)). +This notebook shows how to implement an InfoGAN based on Gluon. 
InfoGAN is an extension of GANs, where the generator input is split in 2 parts: random noise and a latent code (see [InfoGAN Paper](https://arxiv.org/pdf/1606.03657.pdf)). The codes are made meaningful by maximizing the mutual information between code and generator output. InfoGAN learns a disentangled representation in a completely unsupervised manner. It can be used for many applications such as image similarity search. This notebook uses the DCGAN example from the [Straight Dope Book](https://gluon.mxnet.io/chapter14_generative-adversarial-networks/dcgan.html) and extends it to create an InfoGAN. ```python from __future__ import print_function from datetime import datetime -import sys -import os import logging -import time +import multiprocessing +import os +import sys import tarfile +import time +import numpy as np from matplotlib import pyplot as plt +from mxboard import SummaryWriter import mxnet as mx from mxnet import gluon from mxnet import ndarray as nd from mxnet.gluon import nn, utils from mxnet import autograd -from mxboard import SummaryWriter -import numpy as np ``` -The latent code vector c can contain several variables, which can be categorical and/or continuous. We set `n_continuous` to 2 and `n_categories` to 10. +The latent code vector can contain several variables, which can be categorical and/or continuous. We set `n_continuous` to 2 and `n_categories` to 10. ```python @@ -82,20 +83,20 @@ Load the dataset `lfw_dataset` which contains images of celebrities. ```python -data_dir = 'lfw_dataset' +data_dir = 'lfw_dataset' images, filenames = get_files(data_dir) -split = int(len(images)*0.8) -test_images = images[split:] -test_filenames = filenames[split:] -train_images = images[:split] -train_filenames = filenames[:split] - -train_data = gluon.data.ArrayDataset(nd.concatenate(train_images)) -train_dataloader = gluon.data.DataLoader(train_data, batch_size=batch_size, shuffle=True, last_batch='rollover', num_workers=4) +split = int(len(images)*0.8) +test_images = images[split:] +test_filenames = filenames[split:] +train_images = images[:split] +train_filenames = filenames[:split] + +train_data = gluon.data.ArrayDataset(nd.concatenate(train_images)) +train_dataloader = gluon.data.DataLoader(train_data, batch_size=batch_size, shuffle=True, last_batch='rollover', num_workers=multiprocessing.cpu_count()) ``` ## Generator -Define the Generator model. Architecture is taken from the DCGAN implementation in [Straight Dope Book](https://gluon.mxnet.io/chapter14_generative-adversarial-networks/dcgan.html). The Generator consist of 4 layers where each layer involves a strided convolution, batch normalization, and rectified nonlinearity. It takes as input random noise and the latent code `c` and produces an `(64,64,3)` output image. +Define the Generator model. Architecture is taken from the DCGAN implementation in [Straight Dope Book](https://gluon.mxnet.io/chapter14_generative-adversarial-networks/dcgan.html). The Generator consist of 4 layers where each layer involves a strided convolution, batch normalization, and rectified nonlinearity. It takes as input random noise and the latent code and produces an `(64,64,3)` output image. ```python @@ -203,16 +204,16 @@ real_label = nd.ones((batch_size,), ctx=ctx) fake_label = nd.zeros((batch_size,),ctx=ctx) ``` -Load a pertrained model. +Load a pretrained model. 
```python -if os.path.isfile("infogan_d_latest.params") and os.path.isfile("infogan_g_latest.params"): +if os.path.isfile('infogan_d_latest.params') and os.path.isfile('infogan_g_latest.params'): discriminator.load_parameters('infogan_d_latest.params', ctx=ctx, allow_missing=True, ignore_extra=True) generator.load_parameters('infogan_g_latest.params', ctx=ctx, allow_missing=True, ignore_extra=True) ``` -The latent code `c` is part of the Generator input and it contains mutliple variables (continuous, categorical) that can represent different distributions. In order to make sure that the Generator uses the latent code, mutual information is introduced into the GAN loss term. Mutual information measures how much X is known given Y or vice versa. It is defined as: +The latent code is part of the Generator input and it contains mutliple variables (continuous, categorical) that can represent different distributions. In order to make sure that the Generator uses the latent code, mutual information is introduced into the GAN loss term. Mutual information measures how much X is known given Y or vice versa. It is defined as: ![gif](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/doc/tutorials/info_gan/entropy.gif) @@ -220,10 +221,10 @@ The InfoGAN loss is: ![gif](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/doc/tutorials/info_gan/loss.gif) -where `V(D,G)` is the GAN loss and the mutual information `I(c, G(z, c))` goes in as regularization. The goal is to reach high mutual information, in order to learn meaningful codes `c` for the data. +where `V(D,G)` is the GAN loss and the mutual information `I(c, G(z, c))` goes in as regularization. The goal is to reach high mutual information, in order to learn meaningful codes for the data. -Define the loss functions. `SoftmaxCrossEntropyLoss` for the categorical code `c`, `L2Loss` for the continious code `c` and `SigmoidBinaryCrossEntropyLoss` for the normal GAN loss. +Define the loss functions. `SoftmaxCrossEntropyLoss` for the categorical code, `L2Loss` for the continious code and `SigmoidBinaryCrossEntropyLoss` for the normal GAN loss. ```python @@ -260,7 +261,7 @@ Define the training loop. 
with SummaryWriter(logdir='./logs/') as sw: epochs = 1 - i = 0 + counter = 0 for epoch in range(epochs): print("Epoch", epoch) starttime = time.time() @@ -269,8 +270,7 @@ with SummaryWriter(logdir='./logs/') as sw: g_error_epoch = nd.zeros((1,), ctx=ctx) for idx, data in enumerate(train_dataloader): - i = i + 1 - + #get real data and generator input real_data = data.as_in_context(ctx) g_input = create_generator_input() @@ -292,9 +292,9 @@ with SummaryWriter(logdir='./logs/') as sw: d_error_epoch += d_error.mean() #Update D every second iteration - if i % 2 == 0: + if (counter+1) % 2 == 0: d_error.backward() - d_trainer.step(data.shape[0]) + d_trainer.step(batch_size) #Update generator: Input random noise and latent code vector with autograd.record(): @@ -305,8 +305,8 @@ with SummaryWriter(logdir='./logs/') as sw: g_error.backward() g_error_epoch += g_error.mean() - g_trainer.step(data.shape[0]) - q_trainer.step(data.shape[0]) + g_trainer.step(batch_size) + q_trainer.step(batch_size) # logging if idx % 10 == 0: @@ -322,11 +322,9 @@ with SummaryWriter(logdir='./logs/') as sw: sw.add_scalar(tag='Loss_D', value={'test':d_error_epoch.asscalar()/idx}, global_step=i) sw.add_scalar(tag='Loss_G', value={'test':d_error_epoch.asscalar()/idx}, global_step=i) - sw.add_image(tag='data_image', image=((fake_image[0]+ 1.0) * 127.5).astype(np.uint8) , global_step=i) + sw.add_image(tag='data_image', image=((fake_image[0]+ 1.0) * 127.5).astype(np.uint8) , global_step=counter) sw.flush() - - time1 = time.time() - + discriminator.save_parameters("infogan_d_latest.params") generator.save_parameters("infogan_g_latest.params") ``` From 5b9cd7f6166334aaba9a9ea2216f8a8ccbe143f4 Mon Sep 17 00:00:00 2001 From: Nathalie Rauschmayr Date: Wed, 7 Nov 2018 14:06:01 -0800 Subject: [PATCH 17/20] Adding Anirudh's comments --- docs/tutorials/gluon/info_gan.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/tutorials/gluon/info_gan.md b/docs/tutorials/gluon/info_gan.md index 784d5faa7aeb..b01fc11372a5 100644 --- a/docs/tutorials/gluon/info_gan.md +++ b/docs/tutorials/gluon/info_gan.md @@ -246,7 +246,7 @@ def create_generator_input(): c2 = nd.random.uniform(-1, 1, shape=(batch_size, n_continuous)).as_in_context(ctx) # concatenate random noise with c which will be the input of the generator - return nd.concat(z, c1, c2, dim=1) + return nd.concat(z, c1, c2, dim=1), label, c2 ``` Define the training loop. 
@@ -273,7 +273,7 @@ with SummaryWriter(logdir='./logs/') as sw: #get real data and generator input real_data = data.as_in_context(ctx) - g_input = create_generator_input() + g_input, label, c2 = create_generator_input() #Update discriminator: Input real data and fake data @@ -298,7 +298,7 @@ with SummaryWriter(logdir='./logs/') as sw: #Update generator: Input random noise and latent code vector with autograd.record(): - fake_image = generator(g_input) + fake_image,_,_ = generator(g_input) output_fake, category_prob, continuous_mean = discriminator(fake_image) g_error = loss1(output_fake, real_label) + loss3(category_prob, label) + loss2(c2, continuous_mean) @@ -374,7 +374,7 @@ features = nd.zeros((len(test_images), feature_size), ctx=ctx) for idx, image in enumerate(test_images): - feature = discriminator(nd.array(image)) + feature = discriminator(nd.array(image, ctx=ctx)) feature = feature.reshape(feature_size,) features[idx,:] = feature.copyto(ctx) From 797699d5d992873e1be6dd7de3a9a9ef5bb83164 Mon Sep 17 00:00:00 2001 From: Nathalie Rauschmayr Date: Wed, 7 Nov 2018 14:35:47 -0800 Subject: [PATCH 18/20] Fixed some bugs --- docs/tutorials/gluon/info_gan.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/tutorials/gluon/info_gan.md b/docs/tutorials/gluon/info_gan.md index b01fc11372a5..bb936f0fa12f 100644 --- a/docs/tutorials/gluon/info_gan.md +++ b/docs/tutorials/gluon/info_gan.md @@ -298,7 +298,7 @@ with SummaryWriter(logdir='./logs/') as sw: #Update generator: Input random noise and latent code vector with autograd.record(): - fake_image,_,_ = generator(g_input) + fake_image = generator(g_input) output_fake, category_prob, continuous_mean = discriminator(fake_image) g_error = loss1(output_fake, real_label) + loss3(category_prob, label) + loss2(c2, continuous_mean) @@ -315,13 +315,13 @@ with SummaryWriter(logdir='./logs/') as sw: logging.info('discriminator loss = %f, generator loss = %f at iter %d epoch %d' %(d_error_epoch.asscalar()/idx,g_error_epoch.asscalar()/idx, idx, epoch)) - g_input = create_generator_input() + g_input,_,_ = create_generator_input() # create some fake image for logging in MXBoard fake_image = generator(g_input) - sw.add_scalar(tag='Loss_D', value={'test':d_error_epoch.asscalar()/idx}, global_step=i) - sw.add_scalar(tag='Loss_G', value={'test':d_error_epoch.asscalar()/idx}, global_step=i) + sw.add_scalar(tag='Loss_D', value={'test':d_error_epoch.asscalar()/idx}, global_step=counter) + sw.add_scalar(tag='Loss_G', value={'test':d_error_epoch.asscalar()/idx}, global_step=counter) sw.add_image(tag='data_image', image=((fake_image[0]+ 1.0) * 127.5).astype(np.uint8) , global_step=counter) sw.flush() @@ -381,7 +381,7 @@ for idx, image in enumerate(test_images): for image in test_images[:100]: - feature = discriminator(mx.nd.array(image)) + feature = discriminator(mx.nd.array(image, ctx=ctx)) feature = feature.reshape((feature_size,)) image = image.reshape((3,64,64)) From 077ee69a793388c62d45b70d8958262ba0117907 Mon Sep 17 00:00:00 2001 From: Nathalie Rauschmayr Date: Thu, 8 Nov 2018 21:13:56 -0800 Subject: [PATCH 19/20] Adding Anirudh's comments --- docs/tutorials/gluon/info_gan.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorials/gluon/info_gan.md b/docs/tutorials/gluon/info_gan.md index bb936f0fa12f..00fa594b1c65 100644 --- a/docs/tutorials/gluon/info_gan.md +++ b/docs/tutorials/gluon/info_gan.md @@ -212,7 +212,7 @@ if os.path.isfile('infogan_d_latest.params') and os.path.isfile('infogan_g_lates 
discriminator.load_parameters('infogan_d_latest.params', ctx=ctx, allow_missing=True, ignore_extra=True) generator.load_parameters('infogan_g_latest.params', ctx=ctx, allow_missing=True, ignore_extra=True) ``` - +There are 2 differences between InfoGAN and DCGAN: the extra latent code and the Q network to estimate the code. The latent code is part of the Generator input and it contains mutliple variables (continuous, categorical) that can represent different distributions. In order to make sure that the Generator uses the latent code, mutual information is introduced into the GAN loss term. Mutual information measures how much X is known given Y or vice versa. It is defined as: ![gif](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/doc/tutorials/info_gan/entropy.gif) From 9d6bc51e4bcaab36420dafb3c13032e088301553 Mon Sep 17 00:00:00 2001 From: Nathalie Rauschmayr Date: Sun, 11 Nov 2018 09:09:54 -0800 Subject: [PATCH 20/20] some minor fixes --- docs/tutorials/gluon/info_gan.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/tutorials/gluon/info_gan.md b/docs/tutorials/gluon/info_gan.md index 00fa594b1c65..c8f07c6fda35 100644 --- a/docs/tutorials/gluon/info_gan.md +++ b/docs/tutorials/gluon/info_gan.md @@ -92,7 +92,7 @@ train_images = images[:split] train_filenames = filenames[:split] train_data = gluon.data.ArrayDataset(nd.concatenate(train_images)) -train_dataloader = gluon.data.DataLoader(train_data, batch_size=batch_size, shuffle=True, last_batch='rollover', num_workers=multiprocessing.cpu_count()) +train_dataloader = gluon.data.DataLoader(train_data, batch_size=batch_size, shuffle=True, last_batch='rollover', num_workers=multiprocessing.cpu_count()-1) ``` ## Generator @@ -173,6 +173,7 @@ class Discriminator(gluon.HybridBlock): The InfoGAN has the following layout. + Discriminator and Generator are the same as in the DCGAN example. On top of the Disciminator is the Q model, which is estimating the code `c` for given fake images. The Generator's input is random noise and the latent code `c`. ## Training Loop