andreaazzini
diff --git a/Diff for: ‎README.md
+44-4 b/Diff for: ‎README.md
+44-4
diff --git a/Diff for: ‎config.py
-1 b/Diff for: ‎config.py
-1
diff --git a/Diff for: ‎src/convnet.py
+23-3 b/Diff for: ‎src/convnet.py
+23-3
diff --git a/Diff for: ‎src/initializer.py
-9 b/Diff for: ‎src/initializer.py
-9
diff --git a/Diff for: ‎src/models.py
+73-31 b/Diff for: ‎src/models.py
+73-31
diff --git a/Diff for: ‎src/scalar_ops.py
+10 b/Diff for: ‎src/scalar_ops.py
+10
diff --git a/Diff for: ‎src/test.py
+16-17 b/Diff for: ‎src/test.py
+16-17
@@ -4,12 +4,52 @@ SegNet is a TensorFlow implementation of the [segmentation network proposed by K
 
 ## Configuration
 
-Before running, download the [VGG16 weights file](https://www.cs.toronto.edu/~frossard/vgg16/vgg16_weights.npz)
-and save it as `input/vgg16_weights.npz` if you want to initialize the encoder weights with the VGG16 ones trained on ImageNet classification dataset.
+Create a `config.py` file, containing color maps, working dataset and other options.
 
-In `config.py`, choose your working dataset. The dataset name needs to match the data directories you create in your `input` folder.
+```
+colors = {
+  'segnet-32': [
+    [64, 128, 64],   # Animal
+    [192, 0, 128],   # Archway
+    [0, 128, 192],   # Bicyclist
+    [0, 128, 64],    # Bridge
+    [128, 0, 0],     # Building
+    [64, 0, 128],    # Car
+    [64, 0, 192],    # CartLuggagePram
+    [192, 128, 64],  # Child
+    [192, 192, 128], # Column_Pole
+    [64, 64, 128],   # Fence
+    [128, 0, 192],   # LaneMkgsDriv
+    [192, 0, 64],    # LaneMkgsNonDriv
+    [128, 128, 64],  # Misc_Text
+    [192, 0, 192],   # MotorcycleScooter
+    [128, 64, 64],   # OtherMoving
+    [64, 192, 128],  # ParkingBlock
+    [64, 64, 0],     # Pedestrian
+    [128, 64, 128],  # Road
+    [128, 128, 192], # RoadShoulder
+    [0, 0, 192],     # Sidewalk
+    [192, 128, 128], # SignSymbol
+    [128, 128, 128], # Sky
+    [64, 128, 192],  # SUVPickupTruck
+    [0, 0, 64],      # TrafficCone
+    [0, 64, 64],     # TrafficLight
+    [192, 64, 128],  # Train
+    [128, 128, 0],   # Tree
+    [192, 128, 192], # Truck_Bus
+    [64, 0, 64],     # Tunnel
+    [192, 192, 0],   # VegetationMisc
+    [0, 0, 0],       # Void
+    [64, 192, 0]     # Wall
+  ]
+}
+gpu_memory_fraction = 0.7
+working_dataset = 'segnet-32'
+```
+
+The `working_dataset` value needs to match the name of the data directories you create in your `input` folder.
 You can use `segnet-32` and `segnet-13` to replicate the aforementioned Kendall et al. experiments.
 
 ## Train and test
 
-Train SegNet with `python -m src/train.py`. Analogously, test it with `python -m src/test.py`. 
+Train SegNet with `python src/train.py`. Analogously, test it with `python src/test.py`.
@@ -1,12 +1,18 @@
 import tensorflow as tf
 
-def conv(x, receptive_field_shape, channels_shape, stride, name):
+def conv(x, receptive_field_shape, channels_shape, stride, name, repad=False):
+  # 2-D convolution followed by bias add, batch normalisation and ReLU.
+  #
+  # receptive_field_shape and channels_shape are lists that are concatenated
+  # into the kernel shape; the bias has channels_shape[-1] entries.
+  # The variables are created as '<name>_W' and '<name>_b' in the current
+  # variable scope.
+  # repad=True pads axes 1 and 2 of x symmetrically by one element per side
+  # and convolves with 'VALID' padding; otherwise 'SAME' padding is used.
+  # NOTE(review): the fixed one-element border presumably targets 3x3 kernels
+  # at stride 1 (the only repad caller visible here) -- confirm before reusing
+  # it with other kernel sizes.
   kernel_shape = receptive_field_shape + channels_shape
   bias_shape = [channels_shape[-1]]
 
   weights = tf.get_variable('%s_W' % name, kernel_shape, initializer=tf.truncated_normal_initializer(stddev=.1))
   biases = tf.get_variable('%s_b' % name, bias_shape, initializer=tf.constant_initializer(.1))
-  conv = tf.nn.conv2d(x, weights, strides=[1, stride, stride, 1], padding='SAME')
+
+  if repad:
+    padded = tf.pad(x, [[0, 0], [1, 1], [1, 1], [0, 0]], mode='SYMMETRIC')
+    conv = tf.nn.conv2d(padded, weights, strides=[1, stride, stride, 1], padding='VALID')
+  else:
+    conv = tf.nn.conv2d(x, weights, strides=[1, stride, stride, 1], padding='SAME')
+
   conv_bias = tf.nn.bias_add(conv, biases)
   return tf.nn.relu(tf.contrib.layers.batch_norm(conv_bias))
 
@@ -21,9 +27,23 @@ def deconv(x, receptive_field_shape, channels_shape, stride, name):
 
   weights = tf.get_variable('%s_W' % name, kernel_shape, initializer=tf.truncated_normal_initializer(stddev=.1))
   biases = tf.get_variable('%s_b' % name, bias_shape, initializer=tf.constant_initializer(.1))
-  conv = tf.nn.conv2d_transpose(x, weights, [batch_size, height, width, channels_shape[0]], [1, stride, stride, 1], padding='SAME')
+  conv = tf.nn.conv2d_transpose(x, weights, [batch_size, height * stride, width * stride, channels_shape[0]], [1, stride, stride, 1], padding='SAME')
   conv_bias = tf.nn.bias_add(conv, biases)
   return tf.nn.relu(tf.contrib.layers.batch_norm(conv_bias))
 
 def max_pool(x, size, stride, padding='SAME'):
   # Max-pool x with a size-by-size window moved by `stride` along axes 1 and 2.
   window = [1, size, size, 1]
   step = [1, stride, stride, 1]
   return tf.nn.max_pool(x, ksize=window, strides=step, padding=padding, name='maxpool')
+
+def unpool(x, size):
+  # Upsample a 4-D tensor by an integer factor `size` along axes 1 and 2.
+  # Each input value lands in the top-left corner of its size-by-size output
+  # cell; every other position is zero.
+  #
+  # The previous version concatenated exactly one zero copy per axis, so the
+  # final reshape only had the right number of elements for size == 2. Using
+  # (size - 1) zero copies keeps size == 2 identical and makes other factors work.
+  #
+  # Returns a tensor reshaped to [-1, dim1 * size, dim2 * size, channels].
+  # The channel count (last static dimension) must be known at graph build time.
+  out = tf.concat_v2([x] + [tf.zeros_like(x)] * (size - 1), 3)
+  out = tf.concat_v2([out] + [tf.zeros_like(out)] * (size - 1), 2)
+
+  sh = x.get_shape().as_list()
+  if None not in sh[1:]:
+    # Fully static spatial shape: reshape with Python integers.
+    out_size = [-1, sh[1] * size, sh[2] * size, sh[3]]
+    return tf.reshape(out, out_size)
+
+  # Spatial size only known at run time: build the target shape from tf.shape
+  # and restore the static channel count afterwards.
+  shv = tf.shape(x)
+  ret = tf.reshape(out, tf.stack([-1, shv[1] * size, shv[2] * size, sh[3]]))
+  ret.set_shape([None, None, None, sh[3]])
+  return ret
@@ -3,31 +3,35 @@
 import tensorflow as tf
 
 class SegNetAutoencoder:
-  def __init__(self, n, max_images=3):
-    self.params = []
-    self.n = n
+  def __init__(self, n, strided=False, max_images=3):
+    # n: stored as self.n; used as the first entry of the channel shape of the
+    #    last decoder layer ('deconv1_1').
+    # strided: when true, inference() runs strided_inference() instead of
+    #    inference_with_pooling().
+    # max_images: forwarded as max_outputs to the tf.summary.image calls.
+    # NOTE(review): `strided` now sits before `max_images`, so a caller that
+    #    passed max_images positionally would set `strided` instead -- confirm
+    #    all callers use keywords.
     self.max_images = max_images
+    self.n = n
+    self.strided = strided
 
   def conv(self, x, channels_shape, name):
     # 3x3 convolution at stride 1, delegated to cnn.conv.
     receptive_field, stride = [3, 3], 1
     return cnn.conv(x, receptive_field, channels_shape, stride, name)
 
+  def conv2(self, x, channels_shape, name):
+    # 3x3 convolution at stride 2, delegated to cnn.conv.
+    receptive_field, stride = [3, 3], 2
+    return cnn.conv(x, receptive_field, channels_shape, stride, name)
+
   def deconv(self, x, channels_shape, name):
     # 3x3 transposed convolution at stride 1, delegated to cnn.deconv.
     receptive_field, stride = [3, 3], 1
     return cnn.deconv(x, receptive_field, channels_shape, stride, name)
 
   def pool(self, x):
     # Max pooling with window size 2 and stride 2, delegated to cnn.max_pool.
     window = step = 2
     return cnn.max_pool(x, window, step)
 
-  def unpool(self, bottom):
-    sh = bottom.get_shape().as_list()
-    dim = len(sh[1:-1])
-    out = tf.reshape(bottom, [-1] + sh[-dim:])
-    for i in range(dim, 0, -1):
-      out = tf.concat(i, [out, tf.zeros_like(out)])
-    out_size = [-1] + [s * 2 for s in sh[1:-1]] + [sh[-1]]
-    return tf.reshape(out, out_size)
-  
-  def encode(self, images):
-    tf.image_summary('input', images, max_images=self.max_images)
+  def unpool(self, x):
+    # Upsampling by a factor of 2, delegated to cnn.unpool.
+    factor = 2
+    return cnn.unpool(x, factor)
+
+  def resize_conv(self, x, channels_shape, name):
+    # Double the size of axes 1 and 2 of x with nearest-neighbour resizing, then
+    # apply a 3x3, stride-1 cnn.conv with repad=True.
+    #
+    # channels_shape and name are passed straight through to cnn.conv.
+    shape = x.get_shape().as_list()
+    if None in shape[1:3]:
+      # Static spatial size unknown: `None * 2` would raise TypeError, so take
+      # the size from the run-time shape instead (same fallback as cnn.unpool).
+      dynamic_shape = tf.shape(x)
+      new_size = tf.stack([dynamic_shape[1] * 2, dynamic_shape[2] * 2])
+    else:
+      new_size = [shape[1] * 2, shape[2] * 2]
+    resized = tf.image.resize_nearest_neighbor(x, new_size)
+    return cnn.conv(resized, [3, 3], channels_shape, 1, name, repad=True)
+
+  def inference_with_pooling(self, images):
+    tf.summary.image('input', images, max_outputs=self.max_images)
 
     with tf.variable_scope('pool1'):
       conv1 = self.conv(images, [3, 64], 'conv1_1')
@@ -88,26 +92,64 @@ def decode(self, code):
       deconv12 = self.deconv(unpool5, [64, 64], 'deconv1_2')
       deconv13 = self.deconv(deconv12, [self.n, 64], 'deconv1_1')
 
-    rgb_output = classifier.rgb(deconv13)
-    tf.image_summary('output', rgb_output, max_images=self.max_images)
-
+    rgb_image = classifier.rgb(deconv13)
+    tf.summary.image('output', rgb_image, max_outputs=self.max_images)
     return deconv13
 
-  def prepare_encoder_parameters(self):
-    param_format = 'conv%d_%d_%s'
-    conv_layers = [2, 2, 3, 3, 3]
+  def strided_inference(self, images):
+    # Forward pass of the strided variant.
+    # Encoder: five stages (variable scopes 'pool1'..'pool5'), each ending in
+    # the stride-2 self.conv2. Decoder: five stages ('unpool1'..'unpool5'),
+    # each starting with self.resize_conv and followed by self.deconv layers.
+    # Writes 'input' and 'output' image summaries (the latter from
+    # classifier.rgb) and returns the last decoder layer, deconv13.
+    tf.summary.image('input', images, max_outputs=self.max_images)
+
+    with tf.variable_scope('pool1'):
+      conv1 = self.conv(images, [3, 64], 'conv1_1')
+      conv2 = self.conv2(conv1, [64, 64], 'conv1_2')
+
+    with tf.variable_scope('pool2'):
+      conv3 = self.conv(conv2, [64, 128], 'conv2_1')
+      conv4 = self.conv2(conv3, [128, 128], 'conv2_2')
+
+    with tf.variable_scope('pool3'):
+      conv5 = self.conv(conv4, [128, 256], 'conv3_1')
+      conv6 = self.conv(conv5, [256, 256], 'conv3_2')
+      conv7 = self.conv2(conv6, [256, 256], 'conv3_3')
+
+    with tf.variable_scope('pool4'):
+      conv8 = self.conv(conv7, [256, 512], 'conv4_1')
+      conv9 = self.conv(conv8, [512, 512], 'conv4_2')
+      conv10 = self.conv2(conv9, [512, 512], 'conv4_3')
+
+    with tf.variable_scope('pool5'):
+      conv11 = self.conv(conv10, [512, 512], 'conv5_1')
+      conv12 = self.conv(conv11, [512, 512], 'conv5_2')
+      conv13 = self.conv2(conv12, [512, 512], 'conv5_3')
+
+    with tf.variable_scope('unpool1'):
+      deconv1 = self.resize_conv(conv13, [512, 512], 'deconv5_3')
+      deconv2 = self.deconv(deconv1, [512, 512], 'deconv5_2')
+      deconv3 = self.deconv(deconv2, [512, 512], 'deconv5_1')
 
-    for pool in range(1, 6):
-      with tf.variable_scope('pool%d' % pool, reuse=True):
-        for conv in range(1, conv_layers[pool - 1] + 1):
-          weights = tf.get_variable(param_format % (pool, conv, 'W'))
-          biases = tf.get_variable(param_format % (pool, conv, 'b'))
-          self.params += [weights, biases]
+    with tf.variable_scope('unpool2'):
+      deconv4 = self.resize_conv(deconv3, [512, 512], 'deconv4_3')
+      deconv5 = self.deconv(deconv4, [512, 512], 'deconv4_2')
+      deconv6 = self.deconv(deconv5, [256, 512], 'deconv4_1')
 
-  def get_encoder_parameters(self):
-    return self.params
+    with tf.variable_scope('unpool3'):
+      deconv7 = self.resize_conv(deconv6, [256, 256], 'deconv3_3')
+      deconv8 = self.deconv(deconv7, [256, 256], 'deconv3_2')
+      deconv9 = self.deconv(deconv8, [128, 256], 'deconv3_1')
+
+    with tf.variable_scope('unpool4'):
+      deconv10 = self.resize_conv(deconv9, [128, 128], 'deconv2_2')
+      deconv11 = self.deconv(deconv10, [64, 128], 'deconv2_1')
+
+    with tf.variable_scope('unpool5'):
+      deconv12 = self.resize_conv(deconv11, [64, 64], 'deconv1_2')
+      deconv13 = self.deconv(deconv12, [self.n, 64], 'deconv1_1')
+
+    rgb_image = classifier.rgb(deconv13)
+    tf.summary.image('output', rgb_image, max_outputs=self.max_images)
+    return deconv13
 
   def inference(self, images):
+    # Dispatch on self.strided: strided_inference() when set,
+    # inference_with_pooling() otherwise. Returns whatever the chosen path returns.
-    code = self.encode(images)
-    self.prepare_encoder_parameters()
-    return self.decode(code)
+    if self.strided:
+      return self.strided_inference(images)
+    return self.inference_with_pooling(images)
@@ -0,0 +1,10 @@
+import tensorflow as tf
+
+def accuracy(logits, labels, batch_size):
+  # Fraction of elements of `logits` that are exactly equal to the matching
+  # element of `labels`.
+  #
+  # The denominator is taken from the run-time shape of `logits` rather than
+  # the previous hard-coded `batch_size * 224 * 224 * 3`, which was only right
+  # for 224x224 inputs with 3 channels and mis-scaled the result otherwise.
+  # `batch_size` is kept so existing callers keep working; it is no longer
+  # needed for the computation.
+  # NOTE(review): this compares raw network outputs with the labels
+  # element-wise; comparing per-pixel argmax may be what is intended -- confirm.
+  equal_pixels = tf.reduce_sum(tf.to_float(tf.equal(logits, labels)))
+  total_pixels = tf.to_float(tf.reduce_prod(tf.shape(logits)))
+  return equal_pixels / total_pixels
+
+def loss(logits, labels):
+  # Mean softmax cross-entropy between `logits` and `labels`, returned as a
+  # scalar tensor named 'loss'.
+  #
+  # The arguments are passed by keyword: the positional order of
+  # softmax_cross_entropy_with_logits differs between TensorFlow releases, and
+  # newer releases reject positional arguments outright.
+  cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels)
+  return tf.reduce_mean(cross_entropy, name='loss')
@@ -1,43 +1,43 @@
 from inputs import inputs
 from models import SegNetAutoencoder
+from scalar_ops import accuracy, loss
 
 import classifier
 import config
 import tensorflow as tf
 import utils
 
-test_file = utils.get_test_set(config.working_dataset)
+# Test image and label files for the configured dataset; used as the defaults
+# of the 'test' and 'test_labels' flags below.
+test_file, test_labels_file = utils.get_test_set(config.working_dataset, include_labels=True)
 
-tf.app.flags.DEFINE_string('test', test_file, 'Test data')
 tf.app.flags.DEFINE_string('ckpt_dir', './ckpts', 'Train checkpoint directory')
-# tf.app.flags.DEFINE_string('test_labels', './input/test_labels.tfrecords', 'Test labels data')
+tf.app.flags.DEFINE_string('test', test_file, 'Test data')
+tf.app.flags.DEFINE_string('test_labels', test_labels_file, 'Test labels data')
 tf.app.flags.DEFINE_string('test_logs', './logs/test', 'Log directory')
 
-tf.app.flags.DEFINE_integer('batch', 35, 'Batch size')
+# Passed to SegNetAutoencoder as strided=FLAGS.strided in test().
+tf.app.flags.DEFINE_boolean('strided', True, 'Use strided convolutions and deconvolutions')
 
-FLAGS = tf.app.flags.FLAGS
+tf.app.flags.DEFINE_integer('batch', 200, 'Batch size')
 
-def accuracy(logits, labels):
-  equal_pixels = tf.reduce_sum(tf.to_float(tf.equal(logits, labels)))
-  total_pixels = tf.to_float(tf.reduce_prod(tf.shape(logits)))
-  return equal_pixels / total_pixels
+FLAGS = tf.app.flags.FLAGS
 
 def test():
-  #images, labels = inputs(FLAGS.batch, FLAGS.test, FLAGS.test_labels)
-  images = inputs(FLAGS.batch, FLAGS.test)
-  #one_hot_labels = classifier.one_hot(labels)
+  images, labels = inputs(FLAGS.batch, FLAGS.test, FLAGS.test_labels)
+  tf.summary.image('labels', labels)
+  one_hot_labels = classifier.one_hot(labels)
 
-  autoencoder = SegNetAutoencoder(2, max_images=20)
+  autoencoder = SegNetAutoencoder(4, strided=FLAGS.strided)
   logits = autoencoder.inference(images)
 
-  #accuracy_op = accuracy(logits, one_hot_labels)
-  #tf.scalar_summary('accuracy', accuracy_op)
+  accuracy_op = accuracy(logits, one_hot_labels, FLAGS.batch)
+  tf.scalar_summary('accuracy', accuracy_op)
 
   saver = tf.train.Saver(tf.global_variables())
   summary = tf.merge_all_summaries()
   summary_writer = tf.train.SummaryWriter(FLAGS.test_logs)
 
-  with tf.Session() as sess:
+  gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=config.gpu_memory_fraction)
+  session_config = tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options)
+  with tf.Session(config=session_config) as sess:
     coord = tf.train.Coordinator()
     threads = tf.train.start_queue_runners(sess=sess, coord=coord)
 
@@ -50,7 +50,6 @@ def test():
     ckpt_path = ckpt.model_checkpoint_path
     saver.restore(sess, ckpt_path)
 
-    #accuracy_value, summary_str = sess.run([accuracy_op, summary])
     summary_str = sess.run(summary)
     summary_writer.add_summary(summary_str)
     summary_writer.flush()