From b1e850e556c285ed103516b61042e1394b3f5dee Mon Sep 17 00:00:00 2001 From: jbagnatoMacPro Date: Fri, 19 Jun 2020 16:06:19 +0200 Subject: [PATCH] Ejercicio Deteccion de objetos yolo con keras y tensorflow --- Ejercicio_Object_Detection.ipynb | 1795 ++++++++++++++++++++++++++++++ 1 file changed, 1795 insertions(+) create mode 100644 Ejercicio_Object_Detection.ipynb diff --git a/Ejercicio_Object_Detection.ipynb b/Ejercicio_Object_Detection.ipynb new file mode 100644 index 000000000..cda4d17ae --- /dev/null +++ b/Ejercicio_Object_Detection.ipynb @@ -0,0 +1,1795 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Entrena tu Dataset para Detección de Objetos" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Consigue el dataset con las fotos de Lego y sus anotaciones comprando el libro https://leanpub.com/aprendeml/ (puedes descargarlo gratis)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Visión por Ordenador en Machine Learning" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2020-06-14T21:09:36.457551Z", + "start_time": "2020-06-14T21:09:36.451976Z" + } + }, + "source": [ + "Vamos a Crear una Red Neuronal para Detectar personajes de lego en imágenes, cámara o video.\n", + "\n", + "El artículo en el blog www.aprendemachinelearning.com\n", + "\n", + "Empecemos," + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#pip install tensorflow==1.13.2\n", + "#pip install keras==2.0.8\n", + "#pip install imgaug==0.2.5\n", + "#pip install opencv-python\n", + "#pip install h5py\n", + "#pip install tqdm\n", + "#pip install imutils\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-06-18T17:03:38.758432Z", + "start_time": "2020-06-18T17:03:08.179312Z" + } + }, + "outputs": [], + "source": [ + "import argparse\n", + 
"import os\n", + "import numpy as np\n", + "import json\n", + "import cv2\n", + "import copy\n", + "import imgaug as ia\n", + "from imgaug import augmenters as iaa\n", + "from keras.utils import Sequence\n", + "import xml.etree.ElementTree as ET\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Definamos directorio de Annotations xml e imagenes" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "ExecuteTime": { + "end_time": "2020-06-18T21:14:04.460356Z", + "start_time": "2020-06-18T21:14:04.455123Z" + } + }, + "outputs": [], + "source": [ + "xml_dir = \"annotation/lego4/\" # directorio que contiene los xml\n", + "img_dir = \"images/lego4/\" # directorios con las imagenes\n", + "labels = [\"lego\"]\n", + "tamanio = 416 # tamanio en pixeles para entrenar la red\n", + "mejores_pesos = \"red_lego.h5\"" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": { + "ExecuteTime": { + "end_time": "2020-06-18T21:17:57.774752Z", + "start_time": "2020-06-18T21:17:57.758916Z" + } + }, + "outputs": [], + "source": [ + "def leer_annotations(ann_dir, img_dir, labels=[]):\n", + " all_imgs = []\n", + " seen_labels = {}\n", + " \n", + " for ann in [x for x in sorted(os.listdir(ann_dir)) if x.endswith('.xml')] :\n", + " img = {'object':[]}\n", + " \n", + " tree = ET.parse(ann_dir + ann)\n", + " \n", + " for elem in tree.iter():\n", + " if 'filename' in elem.tag:\n", + " img['filename'] = img_dir + elem.text\n", + " if 'width' in elem.tag:\n", + " img['width'] = int(elem.text)\n", + " if 'height' in elem.tag:\n", + " img['height'] = int(elem.text)\n", + " if 'object' in elem.tag or 'part' in elem.tag:\n", + " obj = {}\n", + " \n", + " for attr in list(elem):\n", + " if 'name' in attr.tag:\n", + " obj['name'] = attr.text\n", + "\n", + " if obj['name'] in seen_labels:\n", + " seen_labels[obj['name']] += 1\n", + " else:\n", + " seen_labels[obj['name']] = 1\n", + " \n", + " if len(labels) > 0 and obj['name'] 
not in labels:\n", + " break\n", + " else:\n", + " img['object'] += [obj]\n", + " \n", + " if 'bndbox' in attr.tag:\n", + " for dim in list(attr):\n", + " if 'xmin' in dim.tag:\n", + " obj['xmin'] = int(round(float(dim.text)))\n", + " if 'ymin' in dim.tag:\n", + " obj['ymin'] = int(round(float(dim.text)))\n", + " if 'xmax' in dim.tag:\n", + " obj['xmax'] = int(round(float(dim.text)))\n", + " if 'ymax' in dim.tag:\n", + " obj['ymax'] = int(round(float(dim.text)))\n", + "\n", + " if len(img['object']) > 0:\n", + " all_imgs += [img]\n", + " \n", + " return all_imgs, seen_labels\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Y las cargamos:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-06-18T21:18:00.144781Z", + "start_time": "2020-06-18T21:18:00.105268Z" + } + }, + "outputs": [], + "source": [ + "train_imgs, train_labels = leer_annotations(xml_dir, img_dir, labels)\n", + "print('imagenes',len(train_imgs), 'labels',len(train_labels))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Separemos en Train y Validation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-06-18T21:18:01.825926Z", + "start_time": "2020-06-18T21:18:01.816814Z" + } + }, + "outputs": [], + "source": [ + "train_valid_split = int(0.8*len(train_imgs))\n", + "np.random.shuffle(train_imgs)\n", + "valid_imgs = train_imgs[train_valid_split:]\n", + "train_imgs = train_imgs[:train_valid_split]\n", + "print('train:',len(train_imgs), 'validate:',len(valid_imgs))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Data Augmentation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Hagamos pequeñas distorsiones a las imágenes de entrada para entrenar con mayor variedad y mejorar la precisión de la red." + ] + }, + { + "cell_type": "code", + "execution_count": 
44, + "metadata": { + "ExecuteTime": { + "end_time": "2020-06-18T21:18:05.410047Z", + "start_time": "2020-06-18T21:18:05.322797Z" + } + }, + "outputs": [], + "source": [ + "\n", + "def bbox_iou(box1, box2):\n", + " intersect_w = _interval_overlap([box1.xmin, box1.xmax], [box2.xmin, box2.xmax])\n", + " intersect_h = _interval_overlap([box1.ymin, box1.ymax], [box2.ymin, box2.ymax]) \n", + " \n", + " intersect = intersect_w * intersect_h\n", + "\n", + " w1, h1 = box1.xmax-box1.xmin, box1.ymax-box1.ymin\n", + " w2, h2 = box2.xmax-box2.xmin, box2.ymax-box2.ymin\n", + " \n", + " union = w1*h1 + w2*h2 - intersect\n", + " \n", + " return float(intersect) / union\n", + "\n", + "class BoundBox:\n", + " def __init__(self, xmin, ymin, xmax, ymax, c = None, classes = None):\n", + " self.xmin = xmin\n", + " self.ymin = ymin\n", + " self.xmax = xmax\n", + " self.ymax = ymax\n", + " \n", + " self.c = c\n", + " self.classes = classes\n", + "\n", + " self.label = -1\n", + " self.score = -1\n", + "\n", + " def get_label(self):\n", + " if self.label == -1:\n", + " self.label = np.argmax(self.classes)\n", + " \n", + " return self.label\n", + " \n", + " def get_score(self):\n", + " if self.score == -1:\n", + " self.score = self.classes[self.get_label()]\n", + " \n", + " return self.score\n", + "\n", + "\n", + "class BatchGenerator(Sequence):\n", + " def __init__(self, images, \n", + " config, \n", + " shuffle=True, \n", + " jitter=True, \n", + " norm=None):\n", + " self.generator = None\n", + "\n", + " self.images = images\n", + " self.config = config\n", + "\n", + " self.shuffle = shuffle\n", + " self.jitter = jitter\n", + " self.norm = norm\n", + "\n", + " self.anchors = [BoundBox(0, 0, config['ANCHORS'][2*i], config['ANCHORS'][2*i+1]) for i in range(int(len(config['ANCHORS'])//2))]\n", + "\n", + " ### augmentors by https://github.com/aleju/imgaug\n", + " sometimes = lambda aug: iaa.Sometimes(0.5, aug)\n", + "\n", + " self.aug_pipe = iaa.Sequential(\n", + " [\n", + " 
sometimes(iaa.Affine()),\n", + " iaa.SomeOf((0, 5),\n", + " [\n", + " iaa.OneOf([\n", + " iaa.GaussianBlur((0, 3.0)), # blur images with a sigma between 0 and 3.0\n", + " iaa.AverageBlur(k=(2, 7)), # blur image using local means with kernel sizes between 2 and 7\n", + " iaa.MedianBlur(k=(3, 11)), # blur image using local medians with kernel sizes between 3 and 11\n", + " ]),\n", + " iaa.Sharpen(alpha=(0, 1.0), lightness=(0.75, 1.5)), # sharpen images\n", + " iaa.AdditiveGaussianNoise(loc=0, scale=(0.0, 0.05*255), per_channel=0.5), # add gaussian noise to images\n", + " iaa.OneOf([\n", + " iaa.Dropout((0.01, 0.1), per_channel=0.5), # randomly remove up to 10% of the pixels\n", + " ]),\n", + " iaa.Add((-10, 10), per_channel=0.5), # change brightness of images (by -10 to 10 of original value)\n", + " iaa.Multiply((0.5, 1.5), per_channel=0.5), # change brightness of images (50-150% of original value)\n", + " iaa.ContrastNormalization((0.5, 2.0), per_channel=0.5), # improve or worsen the contrast\n", + " ],\n", + " random_order=True\n", + " )\n", + " ],\n", + " random_order=True\n", + " )\n", + "\n", + " if shuffle: np.random.shuffle(self.images)\n", + "\n", + " def __len__(self):\n", + " return int(np.ceil(float(len(self.images))/self.config['BATCH_SIZE'])) \n", + "\n", + " def num_classes(self):\n", + " return len(self.config['LABELS'])\n", + "\n", + " def size(self):\n", + " return len(self.images) \n", + "\n", + " def load_annotation(self, i):\n", + " annots = []\n", + "\n", + " for obj in self.images[i]['object']:\n", + " annot = [obj['xmin'], obj['ymin'], obj['xmax'], obj['ymax'], self.config['LABELS'].index(obj['name'])]\n", + " annots += [annot]\n", + "\n", + " if len(annots) == 0: annots = [[]]\n", + "\n", + " return np.array(annots)\n", + "\n", + " def load_image(self, i):\n", + " return cv2.imread(self.images[i]['filename'])\n", + "\n", + " def __getitem__(self, idx):\n", + " l_bound = idx*self.config['BATCH_SIZE']\n", + " r_bound = 
(idx+1)*self.config['BATCH_SIZE']\n", + "\n", + " if r_bound > len(self.images):\n", + " r_bound = len(self.images)\n", + " l_bound = r_bound - self.config['BATCH_SIZE']\n", + "\n", + " instance_count = 0\n", + "\n", + " x_batch = np.zeros((r_bound - l_bound, self.config['IMAGE_H'], self.config['IMAGE_W'], 3)) # input images\n", + " b_batch = np.zeros((r_bound - l_bound, 1 , 1 , 1 , self.config['TRUE_BOX_BUFFER'], 4)) # list of self.config['TRUE_BOX_BUFFER'] GT boxes\n", + " y_batch = np.zeros((r_bound - l_bound, self.config['GRID_H'], self.config['GRID_W'], self.config['BOX'], 4+1+len(self.config['LABELS']))) # desired network output\n", + "\n", + " for train_instance in self.images[l_bound:r_bound]:\n", + " # augment input image and fix object's position and size\n", + " img, all_objs = self.aug_image(train_instance, jitter=self.jitter)\n", + " \n", + " # construct output from object's x, y, w, h\n", + " true_box_index = 0\n", + " \n", + " for obj in all_objs:\n", + " if obj['xmax'] > obj['xmin'] and obj['ymax'] > obj['ymin'] and obj['name'] in self.config['LABELS']:\n", + " center_x = .5*(obj['xmin'] + obj['xmax'])\n", + " center_x = center_x / (float(self.config['IMAGE_W']) / self.config['GRID_W'])\n", + " center_y = .5*(obj['ymin'] + obj['ymax'])\n", + " center_y = center_y / (float(self.config['IMAGE_H']) / self.config['GRID_H'])\n", + "\n", + " grid_x = int(np.floor(center_x))\n", + " grid_y = int(np.floor(center_y))\n", + "\n", + " if grid_x < self.config['GRID_W'] and grid_y < self.config['GRID_H']:\n", + " obj_indx = self.config['LABELS'].index(obj['name'])\n", + " \n", + " center_w = (obj['xmax'] - obj['xmin']) / (float(self.config['IMAGE_W']) / self.config['GRID_W']) # unit: grid cell\n", + " center_h = (obj['ymax'] - obj['ymin']) / (float(self.config['IMAGE_H']) / self.config['GRID_H']) # unit: grid cell\n", + " \n", + " box = [center_x, center_y, center_w, center_h]\n", + "\n", + " # find the anchor that best predicts this box\n", + " 
best_anchor = -1\n", + " max_iou = -1\n", + " \n", + " shifted_box = BoundBox(0, \n", + " 0,\n", + " center_w, \n", + " center_h)\n", + " \n", + " for i in range(len(self.anchors)):\n", + " anchor = self.anchors[i]\n", + " iou = bbox_iou(shifted_box, anchor)\n", + " \n", + " if max_iou < iou:\n", + " best_anchor = i\n", + " max_iou = iou\n", + " \n", + " # assign ground truth x, y, w, h, confidence and class probs to y_batch\n", + " y_batch[instance_count, grid_y, grid_x, best_anchor, 0:4] = box\n", + " y_batch[instance_count, grid_y, grid_x, best_anchor, 4 ] = 1.\n", + " y_batch[instance_count, grid_y, grid_x, best_anchor, 5+obj_indx] = 1\n", + " \n", + " # assign the true box to b_batch\n", + " b_batch[instance_count, 0, 0, 0, true_box_index] = box\n", + " \n", + " true_box_index += 1\n", + " true_box_index = true_box_index % self.config['TRUE_BOX_BUFFER']\n", + " \n", + " # assign input image to x_batch\n", + " if self.norm != None: \n", + " x_batch[instance_count] = self.norm(img)\n", + " else:\n", + " # plot image and bounding boxes for sanity check\n", + " for obj in all_objs:\n", + " if obj['xmax'] > obj['xmin'] and obj['ymax'] > obj['ymin']:\n", + " cv2.rectangle(img[:,:,::-1], (obj['xmin'],obj['ymin']), (obj['xmax'],obj['ymax']), (255,0,0), 3)\n", + " cv2.putText(img[:,:,::-1], obj['name'], \n", + " (obj['xmin']+2, obj['ymin']+12), \n", + " 0, 1.2e-3 * img.shape[0], \n", + " (0,255,0), 2)\n", + " \n", + " x_batch[instance_count] = img\n", + "\n", + " # increase instance counter in current batch\n", + " instance_count += 1 \n", + "\n", + " #print(' new batch created', idx)\n", + "\n", + " return [x_batch, b_batch], y_batch\n", + "\n", + " def on_epoch_end(self):\n", + " if self.shuffle: np.random.shuffle(self.images)\n", + "\n", + " def aug_image(self, train_instance, jitter):\n", + " image_name = train_instance['filename']\n", + " image = cv2.imread(image_name)\n", + "\n", + " if image is None: print('Cannot find ', image_name)\n", + "\n", + " h, w, c = 
image.shape\n", + " all_objs = copy.deepcopy(train_instance['object'])\n", + "\n", + " if jitter:\n", + " ### scale the image\n", + " scale = np.random.uniform() / 10. + 1.\n", + " image = cv2.resize(image, (0,0), fx = scale, fy = scale)\n", + "\n", + " ### translate the image\n", + " max_offx = (scale-1.) * w\n", + " max_offy = (scale-1.) * h\n", + " offx = int(np.random.uniform() * max_offx)\n", + " offy = int(np.random.uniform() * max_offy)\n", + " \n", + " image = image[offy : (offy + h), offx : (offx + w)]\n", + "\n", + " ### flip the image\n", + " flip = np.random.binomial(1, .5)\n", + " if flip > 0.5: image = cv2.flip(image, 1)\n", + " \n", + " image = self.aug_pipe.augment_image(image) \n", + " \n", + " # resize the image to standard size\n", + " image = cv2.resize(image, (self.config['IMAGE_H'], self.config['IMAGE_W']))\n", + " image = image[:,:,::-1]\n", + "\n", + " # fix object's position and size\n", + " for obj in all_objs:\n", + " for attr in ['xmin', 'xmax']:\n", + " if jitter: obj[attr] = int(obj[attr] * scale - offx)\n", + " \n", + " obj[attr] = int(obj[attr] * float(self.config['IMAGE_W']) / w)\n", + " obj[attr] = max(min(obj[attr], self.config['IMAGE_W']), 0)\n", + " \n", + " for attr in ['ymin', 'ymax']:\n", + " if jitter: obj[attr] = int(obj[attr] * scale - offy)\n", + " \n", + " obj[attr] = int(obj[attr] * float(self.config['IMAGE_H']) / h)\n", + " obj[attr] = max(min(obj[attr], self.config['IMAGE_H']), 0)\n", + "\n", + " if jitter and flip > 0.5:\n", + " xmin = obj['xmin']\n", + " obj['xmin'] = self.config['IMAGE_W'] - obj['xmax']\n", + " obj['xmax'] = self.config['IMAGE_W'] - xmin\n", + " \n", + " return image, all_objs\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Crear el Modelo YOLOv2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Descarga el archivo con los pesos de la red full_yolo_backend.h5 desde 
https://drive.google.com/file/d/1Q9WhhRlqQbA4jgBkCDrynvgquRXZA_f8/view?usp=sharing\n", + "y copialos en este mismo directorio" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": { + "ExecuteTime": { + "end_time": "2020-06-18T21:18:08.294971Z", + "start_time": "2020-06-18T21:18:08.232388Z" + } + }, + "outputs": [], + "source": [ + "from keras.models import Model\n", + "import tensorflow as tf\n", + "from keras.layers import Reshape, Activation, Conv2D, Input, MaxPooling2D, BatchNormalization, Flatten, Dense, Lambda\n", + "from keras.layers.advanced_activations import LeakyReLU\n", + "from keras.layers.merge import concatenate\n", + "from keras.optimizers import SGD, Adam, RMSprop\n", + "from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard\n", + "\n", + "\n", + "FULL_YOLO_BACKEND_PATH = \"full_yolo_backend.h5\" # should be hosted on a server\n", + "\n", + "class BaseFeatureExtractor(object):\n", + " \"\"\"docstring for ClassName\"\"\"\n", + "\n", + " # to be defined in each subclass\n", + " def __init__(self, input_size):\n", + " raise NotImplementedError(\"error message\")\n", + "\n", + " # to be defined in each subclass\n", + " def normalize(self, image):\n", + " raise NotImplementedError(\"error message\") \n", + "\n", + " def get_output_shape(self):\n", + " return self.feature_extractor.get_output_shape_at(-1)[1:3]\n", + "\n", + " def extract(self, input_image):\n", + " return self.feature_extractor(input_image)\n", + "\n", + "class FullYoloFeature(BaseFeatureExtractor):\n", + " \"\"\"docstring for ClassName\"\"\"\n", + " def __init__(self, input_size):\n", + " input_image = Input(shape=(input_size, input_size, 3))\n", + "\n", + " # the function to implement the orgnization layer (thanks to github.com/allanzelener/YAD2K)\n", + " def space_to_depth_x2(x):\n", + " return tf.space_to_depth(x, block_size=2)\n", + "\n", + " # Layer 1\n", + " x = Conv2D(32, (3,3), strides=(1,1), padding='same', name='conv_1', 
use_bias=False)(input_image)\n", + " x = BatchNormalization(name='norm_1')(x)\n", + " x = LeakyReLU(alpha=0.1)(x)\n", + " x = MaxPooling2D(pool_size=(2, 2))(x)\n", + "\n", + " # Layer 2\n", + " x = Conv2D(64, (3,3), strides=(1,1), padding='same', name='conv_2', use_bias=False)(x)\n", + " x = BatchNormalization(name='norm_2')(x)\n", + " x = LeakyReLU(alpha=0.1)(x)\n", + " x = MaxPooling2D(pool_size=(2, 2))(x)\n", + "\n", + " # Layer 3\n", + " x = Conv2D(128, (3,3), strides=(1,1), padding='same', name='conv_3', use_bias=False)(x)\n", + " x = BatchNormalization(name='norm_3')(x)\n", + " x = LeakyReLU(alpha=0.1)(x)\n", + "\n", + " # Layer 4\n", + " x = Conv2D(64, (1,1), strides=(1,1), padding='same', name='conv_4', use_bias=False)(x)\n", + " x = BatchNormalization(name='norm_4')(x)\n", + " x = LeakyReLU(alpha=0.1)(x)\n", + "\n", + " # Layer 5\n", + " x = Conv2D(128, (3,3), strides=(1,1), padding='same', name='conv_5', use_bias=False)(x)\n", + " x = BatchNormalization(name='norm_5')(x)\n", + " x = LeakyReLU(alpha=0.1)(x)\n", + " x = MaxPooling2D(pool_size=(2, 2))(x)\n", + "\n", + " # Layer 6\n", + " x = Conv2D(256, (3,3), strides=(1,1), padding='same', name='conv_6', use_bias=False)(x)\n", + " x = BatchNormalization(name='norm_6')(x)\n", + " x = LeakyReLU(alpha=0.1)(x)\n", + "\n", + " # Layer 7\n", + " x = Conv2D(128, (1,1), strides=(1,1), padding='same', name='conv_7', use_bias=False)(x)\n", + " x = BatchNormalization(name='norm_7')(x)\n", + " x = LeakyReLU(alpha=0.1)(x)\n", + "\n", + " # Layer 8\n", + " x = Conv2D(256, (3,3), strides=(1,1), padding='same', name='conv_8', use_bias=False)(x)\n", + " x = BatchNormalization(name='norm_8')(x)\n", + " x = LeakyReLU(alpha=0.1)(x)\n", + " x = MaxPooling2D(pool_size=(2, 2))(x)\n", + "\n", + " # Layer 9\n", + " x = Conv2D(512, (3,3), strides=(1,1), padding='same', name='conv_9', use_bias=False)(x)\n", + " x = BatchNormalization(name='norm_9')(x)\n", + " x = LeakyReLU(alpha=0.1)(x)\n", + "\n", + " # Layer 10\n", + " x = 
Conv2D(256, (1,1), strides=(1,1), padding='same', name='conv_10', use_bias=False)(x)\n", + " x = BatchNormalization(name='norm_10')(x)\n", + " x = LeakyReLU(alpha=0.1)(x)\n", + "\n", + " # Layer 11\n", + " x = Conv2D(512, (3,3), strides=(1,1), padding='same', name='conv_11', use_bias=False)(x)\n", + " x = BatchNormalization(name='norm_11')(x)\n", + " x = LeakyReLU(alpha=0.1)(x)\n", + "\n", + " # Layer 12\n", + " x = Conv2D(256, (1,1), strides=(1,1), padding='same', name='conv_12', use_bias=False)(x)\n", + " x = BatchNormalization(name='norm_12')(x)\n", + " x = LeakyReLU(alpha=0.1)(x)\n", + "\n", + " # Layer 13\n", + " x = Conv2D(512, (3,3), strides=(1,1), padding='same', name='conv_13', use_bias=False)(x)\n", + " x = BatchNormalization(name='norm_13')(x)\n", + " x = LeakyReLU(alpha=0.1)(x)\n", + "\n", + " skip_connection = x\n", + "\n", + " x = MaxPooling2D(pool_size=(2, 2))(x)\n", + "\n", + " # Layer 14\n", + " x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_14', use_bias=False)(x)\n", + " x = BatchNormalization(name='norm_14')(x)\n", + " x = LeakyReLU(alpha=0.1)(x)\n", + "\n", + " # Layer 15\n", + " x = Conv2D(512, (1,1), strides=(1,1), padding='same', name='conv_15', use_bias=False)(x)\n", + " x = BatchNormalization(name='norm_15')(x)\n", + " x = LeakyReLU(alpha=0.1)(x)\n", + "\n", + " # Layer 16\n", + " x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_16', use_bias=False)(x)\n", + " x = BatchNormalization(name='norm_16')(x)\n", + " x = LeakyReLU(alpha=0.1)(x)\n", + "\n", + " # Layer 17\n", + " x = Conv2D(512, (1,1), strides=(1,1), padding='same', name='conv_17', use_bias=False)(x)\n", + " x = BatchNormalization(name='norm_17')(x)\n", + " x = LeakyReLU(alpha=0.1)(x)\n", + "\n", + " # Layer 18\n", + " x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_18', use_bias=False)(x)\n", + " x = BatchNormalization(name='norm_18')(x)\n", + " x = LeakyReLU(alpha=0.1)(x)\n", + "\n", + " # Layer 19\n", + " x = Conv2D(1024, 
(3,3), strides=(1,1), padding='same', name='conv_19', use_bias=False)(x)\n", + " x = BatchNormalization(name='norm_19')(x)\n", + " x = LeakyReLU(alpha=0.1)(x)\n", + "\n", + " # Layer 20\n", + " x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_20', use_bias=False)(x)\n", + " x = BatchNormalization(name='norm_20')(x)\n", + " x = LeakyReLU(alpha=0.1)(x)\n", + "\n", + " # Layer 21\n", + " skip_connection = Conv2D(64, (1,1), strides=(1,1), padding='same', name='conv_21', use_bias=False)(skip_connection)\n", + " skip_connection = BatchNormalization(name='norm_21')(skip_connection)\n", + " skip_connection = LeakyReLU(alpha=0.1)(skip_connection)\n", + " skip_connection = Lambda(space_to_depth_x2)(skip_connection)\n", + "\n", + " x = concatenate([skip_connection, x])\n", + "\n", + " # Layer 22\n", + " x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_22', use_bias=False)(x)\n", + " x = BatchNormalization(name='norm_22')(x)\n", + " x = LeakyReLU(alpha=0.1)(x)\n", + "\n", + " self.feature_extractor = Model(input_image, x)\n", + " self.feature_extractor.load_weights(FULL_YOLO_BACKEND_PATH)\n", + "\n", + " def normalize(self, image):\n", + " return image / 255.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": { + "ExecuteTime": { + "end_time": "2020-06-18T21:18:09.266689Z", + "start_time": "2020-06-18T21:18:09.236772Z" + } + }, + "outputs": [], + "source": [ + "# funciones que necesitaremos\n", + "\n", + "def _sigmoid(x):\n", + " return 1. / (1. 
+ np.exp(-x))\n", + "\n", + "def _softmax(x, axis=-1, t=-100.):\n", + " x = x - np.max(x)\n", + " \n", + " if np.min(x) < t:\n", + " x = x/np.min(x)*t\n", + " \n", + " e_x = np.exp(x)\n", + " \n", + " return e_x / e_x.sum(axis, keepdims=True)\n", + "\n", + "def _interval_overlap(interval_a, interval_b):\n", + " x1, x2 = interval_a\n", + " x3, x4 = interval_b\n", + "\n", + " if x3 < x1:\n", + " if x4 < x1:\n", + " return 0\n", + " else:\n", + " return min(x2,x4) - x1\n", + " else:\n", + " if x2 < x3:\n", + " return 0\n", + " else:\n", + " return min(x2,x4) - x3 \n", + "\n", + "def compute_overlap(a, b):\n", + " \"\"\"\n", + " Code originally from https://github.com/rbgirshick/py-faster-rcnn.\n", + " Parameters\n", + " ----------\n", + " a: (N, 4) ndarray of float\n", + " b: (K, 4) ndarray of float\n", + " Returns\n", + " -------\n", + " overlaps: (N, K) ndarray of overlap between boxes and query_boxes\n", + " \"\"\"\n", + " area = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1])\n", + "\n", + " iw = np.minimum(np.expand_dims(a[:, 2], axis=1), b[:, 2]) - np.maximum(np.expand_dims(a[:, 0], 1), b[:, 0])\n", + " ih = np.minimum(np.expand_dims(a[:, 3], axis=1), b[:, 3]) - np.maximum(np.expand_dims(a[:, 1], 1), b[:, 1])\n", + "\n", + " iw = np.maximum(iw, 0)\n", + " ih = np.maximum(ih, 0)\n", + "\n", + " ua = np.expand_dims((a[:, 2] - a[:, 0]) * (a[:, 3] - a[:, 1]), axis=1) + area - iw * ih\n", + "\n", + " ua = np.maximum(ua, np.finfo(float).eps)\n", + "\n", + " intersection = iw * ih\n", + "\n", + " return intersection / ua \n", + " \n", + "def compute_ap(recall, precision):\n", + " \"\"\" Compute the average precision, given the recall and precision curves.\n", + " Code originally from https://github.com/rbgirshick/py-faster-rcnn.\n", + "\n", + " # Arguments\n", + " recall: The recall curve (list).\n", + " precision: The precision curve (list).\n", + " # Returns\n", + " The average precision as computed in py-faster-rcnn.\n", + " \"\"\"\n", + " # correct AP calculation\n", + " 
# first append sentinel values at the end\n", + " mrec = np.concatenate(([0.], recall, [1.]))\n", + " mpre = np.concatenate(([0.], precision, [0.]))\n", + "\n", + " # compute the precision envelope\n", + " for i in range(mpre.size - 1, 0, -1):\n", + " mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])\n", + "\n", + " # to calculate area under PR curve, look for points\n", + " # where X axis (recall) changes value\n", + " i = np.where(mrec[1:] != mrec[:-1])[0]\n", + "\n", + " # and sum (\\Delta recall) * prec\n", + " ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])\n", + " return ap \n", + "\n", + "def decode_netout(netout, anchors, nb_class, obj_threshold=0.3, nms_threshold=0.3):\n", + " grid_h, grid_w, nb_box = netout.shape[:3]\n", + "\n", + " boxes = []\n", + " \n", + " # decode the output by the network\n", + " netout[..., 4] = _sigmoid(netout[..., 4])\n", + " netout[..., 5:] = netout[..., 4][..., np.newaxis] * _softmax(netout[..., 5:])\n", + " netout[..., 5:] *= netout[..., 5:] > obj_threshold\n", + " \n", + " for row in range(grid_h):\n", + " for col in range(grid_w):\n", + " for b in range(nb_box):\n", + " # from 4th element onwards are confidence and class classes\n", + " classes = netout[row,col,b,5:]\n", + " \n", + " if np.sum(classes) > 0:\n", + " # first 4 elements are x, y, w, and h\n", + " x, y, w, h = netout[row,col,b,:4]\n", + "\n", + " x = (col + _sigmoid(x)) / grid_w # center position, unit: image width\n", + " y = (row + _sigmoid(y)) / grid_h # center position, unit: image height\n", + " w = anchors[2 * b + 0] * np.exp(w) / grid_w # unit: image width\n", + " h = anchors[2 * b + 1] * np.exp(h) / grid_h # unit: image height\n", + " confidence = netout[row,col,b,4]\n", + " \n", + " box = BoundBox(x-w/2, y-h/2, x+w/2, y+h/2, confidence, classes)\n", + " \n", + " boxes.append(box)\n", + "\n", + " # suppress non-maximal boxes\n", + " for c in range(nb_class):\n", + " sorted_indices = list(reversed(np.argsort([box.classes[c] for box in boxes])))\n", + 
"\n", + " for i in range(len(sorted_indices)):\n", + " index_i = sorted_indices[i]\n", + " \n", + " if boxes[index_i].classes[c] == 0: \n", + " continue\n", + " else:\n", + " for j in range(i+1, len(sorted_indices)):\n", + " index_j = sorted_indices[j]\n", + " \n", + " if bbox_iou(boxes[index_i], boxes[index_j]) >= nms_threshold:\n", + " boxes[index_j].classes[c] = 0\n", + " \n", + " # remove the boxes which are less likely than a obj_threshold\n", + " boxes = [box for box in boxes if box.get_score() > obj_threshold]\n", + " \n", + " return boxes " + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": { + "ExecuteTime": { + "end_time": "2020-06-18T21:18:10.604188Z", + "start_time": "2020-06-18T21:18:10.503708Z" + } + }, + "outputs": [], + "source": [ + "\n", + "class YOLO(object):\n", + " def __init__(self, \n", + " input_size, \n", + " labels, \n", + " max_box_per_image,\n", + " anchors):\n", + "\n", + " self.input_size = input_size\n", + " \n", + " self.labels = list(labels)\n", + " self.nb_class = len(self.labels)\n", + " self.nb_box = len(anchors)//2\n", + " self.class_wt = np.ones(self.nb_class, dtype='float32')\n", + " self.anchors = anchors\n", + "\n", + " self.max_box_per_image = max_box_per_image\n", + "\n", + " ##########################\n", + " # Make the model\n", + " ##########################\n", + "\n", + " # make the feature extractor layers\n", + " input_image = Input(shape=(self.input_size, self.input_size, 3))\n", + " self.true_boxes = Input(shape=(1, 1, 1, max_box_per_image , 4)) \n", + "\n", + " self.feature_extractor = FullYoloFeature(self.input_size)\n", + "\n", + " print(self.feature_extractor.get_output_shape()) \n", + " self.grid_h, self.grid_w = self.feature_extractor.get_output_shape() \n", + " features = self.feature_extractor.extract(input_image) \n", + "\n", + " # make the object detection layer\n", + " output = Conv2D(self.nb_box * (4 + 1 + self.nb_class), \n", + " (1,1), strides=(1,1), \n", + " padding='same', 
\n", + " name='DetectionLayer', \n", + " kernel_initializer='lecun_normal')(features)\n", + " output = Reshape((self.grid_h, self.grid_w, self.nb_box, 4 + 1 + self.nb_class))(output)\n", + " output = Lambda(lambda args: args[0])([output, self.true_boxes])\n", + "\n", + " self.model = Model([input_image, self.true_boxes], output)\n", + "\n", + " \n", + " # initialize the weights of the detection layer\n", + " layer = self.model.layers[-4]\n", + " weights = layer.get_weights()\n", + "\n", + " new_kernel = np.random.normal(size=weights[0].shape)/(self.grid_h*self.grid_w)\n", + " new_bias = np.random.normal(size=weights[1].shape)/(self.grid_h*self.grid_w)\n", + "\n", + " layer.set_weights([new_kernel, new_bias])\n", + "\n", + " # print a summary of the whole model\n", + " self.model.summary()\n", + "\n", + " def custom_loss(self, y_true, y_pred):\n", + " mask_shape = tf.shape(y_true)[:4]\n", + " \n", + " cell_x = tf.to_float(tf.reshape(tf.tile(tf.range(self.grid_w), [self.grid_h]), (1, self.grid_h, self.grid_w, 1, 1)))\n", + " cell_y = tf.transpose(cell_x, (0,2,1,3,4))\n", + "\n", + " cell_grid = tf.tile(tf.concat([cell_x,cell_y], -1), [self.batch_size, 1, 1, self.nb_box, 1])\n", + " \n", + " coord_mask = tf.zeros(mask_shape)\n", + " conf_mask = tf.zeros(mask_shape)\n", + " class_mask = tf.zeros(mask_shape)\n", + " \n", + " seen = tf.Variable(0.)\n", + " total_recall = tf.Variable(0.)\n", + " \n", + " \"\"\"\n", + " Adjust prediction\n", + " \"\"\"\n", + " ### adjust x and y \n", + " pred_box_xy = tf.sigmoid(y_pred[..., :2]) + cell_grid\n", + " \n", + " ### adjust w and h\n", + " pred_box_wh = tf.exp(y_pred[..., 2:4]) * np.reshape(self.anchors, [1,1,1,self.nb_box,2])\n", + " \n", + " ### adjust confidence\n", + " pred_box_conf = tf.sigmoid(y_pred[..., 4])\n", + " \n", + " ### adjust class probabilities\n", + " pred_box_class = y_pred[..., 5:]\n", + " \n", + " \"\"\"\n", + " Adjust ground truth\n", + " \"\"\"\n", + " ### adjust x and y\n", + " true_box_xy = y_true[..., 
0:2] # box center in grid-cell units, measured from the image origin\n", + " \n", + " ### adjust w and h\n", + " true_box_wh = y_true[..., 2:4] # number of cells across, horizontally and vertically\n", + " \n", + " ### adjust confidence\n", + " true_wh_half = true_box_wh / 2.\n", + " true_mins = true_box_xy - true_wh_half\n", + " true_maxes = true_box_xy + true_wh_half\n", + " \n", + " pred_wh_half = pred_box_wh / 2.\n", + " pred_mins = pred_box_xy - pred_wh_half\n", + " pred_maxes = pred_box_xy + pred_wh_half \n", + " \n", + " intersect_mins = tf.maximum(pred_mins, true_mins)\n", + " intersect_maxes = tf.minimum(pred_maxes, true_maxes)\n", + " intersect_wh = tf.maximum(intersect_maxes - intersect_mins, 0.)\n", + " intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]\n", + " \n", + " true_areas = true_box_wh[..., 0] * true_box_wh[..., 1]\n", + " pred_areas = pred_box_wh[..., 0] * pred_box_wh[..., 1]\n", + "\n", + " union_areas = pred_areas + true_areas - intersect_areas\n", + " iou_scores = tf.truediv(intersect_areas, union_areas)\n", + " \n", + " true_box_conf = iou_scores * y_true[..., 4]\n", + " \n", + " ### adjust class probabilities\n", + " true_box_class = tf.argmax(y_true[..., 5:], -1)\n", + " \n", + " \"\"\"\n", + " Determine the masks\n", + " \"\"\"\n", + " ### coordinate mask: simply the position of the ground truth boxes (the predictors)\n", + " coord_mask = tf.expand_dims(y_true[..., 4], axis=-1) * self.coord_scale\n", + " \n", + " ### confidence mask: penalize predictors + penalize boxes with low IOU\n", + " # penalize the confidence of the boxes, which have IOU with some ground truth box < 0.6\n", + " true_xy = self.true_boxes[..., 0:2]\n", + " true_wh = self.true_boxes[..., 2:4]\n", + " \n", + " true_wh_half = true_wh / 2.\n", + " true_mins = true_xy - true_wh_half\n", + " true_maxes = true_xy + true_wh_half\n", + " \n", + " pred_xy = tf.expand_dims(pred_box_xy, 4)\n", + " pred_wh = tf.expand_dims(pred_box_wh, 4)\n", + " \n", + " pred_wh_half = pred_wh / 2.\n", 
+ " pred_mins = pred_xy - pred_wh_half\n", + " pred_maxes = pred_xy + pred_wh_half \n", + " \n", + " intersect_mins = tf.maximum(pred_mins, true_mins)\n", + " intersect_maxes = tf.minimum(pred_maxes, true_maxes)\n", + " intersect_wh = tf.maximum(intersect_maxes - intersect_mins, 0.)\n", + " intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]\n", + " \n", + " true_areas = true_wh[..., 0] * true_wh[..., 1]\n", + " pred_areas = pred_wh[..., 0] * pred_wh[..., 1]\n", + "\n", + " union_areas = pred_areas + true_areas - intersect_areas\n", + " iou_scores = tf.truediv(intersect_areas, union_areas)\n", + "\n", + " best_ious = tf.reduce_max(iou_scores, axis=4)\n", + " conf_mask = conf_mask + tf.to_float(best_ious < 0.6) * (1 - y_true[..., 4]) * self.no_object_scale\n", + " \n", + " # penalize the confidence of the boxes, which are reponsible for corresponding ground truth box\n", + " conf_mask = conf_mask + y_true[..., 4] * self.object_scale\n", + " \n", + " ### class mask: simply the position of the ground truth boxes (the predictors)\n", + " class_mask = y_true[..., 4] * tf.gather(self.class_wt, true_box_class) * self.class_scale \n", + " \n", + " \"\"\"\n", + " Warm-up training\n", + " \"\"\"\n", + " no_boxes_mask = tf.to_float(coord_mask < self.coord_scale/2.)\n", + " seen = tf.assign_add(seen, 1.)\n", + " \n", + " true_box_xy, true_box_wh, coord_mask = tf.cond(tf.less(seen, self.warmup_batches+1), \n", + " lambda: [true_box_xy + (0.5 + cell_grid) * no_boxes_mask, \n", + " true_box_wh + tf.ones_like(true_box_wh) * \\\n", + " np.reshape(self.anchors, [1,1,1,self.nb_box,2]) * \\\n", + " no_boxes_mask, \n", + " tf.ones_like(coord_mask)],\n", + " lambda: [true_box_xy, \n", + " true_box_wh,\n", + " coord_mask])\n", + " \n", + " \"\"\"\n", + " Finalize the loss\n", + " \"\"\"\n", + " nb_coord_box = tf.reduce_sum(tf.to_float(coord_mask > 0.0))\n", + " nb_conf_box = tf.reduce_sum(tf.to_float(conf_mask > 0.0))\n", + " nb_class_box = tf.reduce_sum(tf.to_float(class_mask 
> 0.0))\n", + " \n", + " loss_xy = tf.reduce_sum(tf.square(true_box_xy-pred_box_xy) * coord_mask) / (nb_coord_box + 1e-6) / 2.\n", + " loss_wh = tf.reduce_sum(tf.square(true_box_wh-pred_box_wh) * coord_mask) / (nb_coord_box + 1e-6) / 2.\n", + " loss_conf = tf.reduce_sum(tf.square(true_box_conf-pred_box_conf) * conf_mask) / (nb_conf_box + 1e-6) / 2.\n", + " loss_class = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=true_box_class, logits=pred_box_class)\n", + " loss_class = tf.reduce_sum(loss_class * class_mask) / (nb_class_box + 1e-6)\n", + " \n", + " loss = tf.cond(tf.less(seen, self.warmup_batches+1), \n", + " lambda: loss_xy + loss_wh + loss_conf + loss_class + 10,\n", + " lambda: loss_xy + loss_wh + loss_conf + loss_class)\n", + " \n", + " if self.debug:\n", + " nb_true_box = tf.reduce_sum(y_true[..., 4])\n", + " nb_pred_box = tf.reduce_sum(tf.to_float(true_box_conf > 0.5) * tf.to_float(pred_box_conf > 0.3))\n", + " \n", + " current_recall = nb_pred_box/(nb_true_box + 1e-6)\n", + " total_recall = tf.assign_add(total_recall, current_recall) \n", + "\n", + " loss = tf.Print(loss, [loss_xy], message='Loss XY \\t', summarize=1000)\n", + " loss = tf.Print(loss, [loss_wh], message='Loss WH \\t', summarize=1000)\n", + " loss = tf.Print(loss, [loss_conf], message='Loss Conf \\t', summarize=1000)\n", + " loss = tf.Print(loss, [loss_class], message='Loss Class \\t', summarize=1000)\n", + " loss = tf.Print(loss, [loss], message='Total Loss \\t', summarize=1000)\n", + " loss = tf.Print(loss, [current_recall], message='Current Recall \\t', summarize=1000)\n", + " loss = tf.Print(loss, [total_recall/seen], message='Average Recall \\t', summarize=1000)\n", + " \n", + " return loss\n", + "\n", + " def load_weights(self, weight_path):\n", + " self.model.load_weights(weight_path)\n", + "\n", + " def train(self, train_imgs, # the list of images to train the model\n", + " valid_imgs, # the list of images used to validate the model\n", + " train_times, # the number of time 
def train(self, train_imgs, valid_imgs, train_times, valid_times, nb_epochs,
          learning_rate, batch_size, warmup_epochs, object_scale,
          no_object_scale, coord_scale, class_scale,
          saved_weights_name='best_weights.h5', debug=False):
    """Train the model, checkpoint the best weights and print per-class AP.

    Args:
        train_imgs: list of images used to train the model.
        valid_imgs: list of images used for validation and for the final
            evaluation pass.
        train_times: multiplier applied to the number of training batches
            per epoch (repeats the training set).
        valid_times: multiplier applied to the number of validation batches.
        nb_epochs: epochs to run in addition to the warm-up epochs.
        learning_rate: learning rate handed to the Adam optimizer.
        batch_size: batch size stored on the instance and passed to both
            generators.
        warmup_epochs: number of warm-up *epochs*; they are added to
            ``nb_epochs`` and converted into ``self.warmup_batches``.
        object_scale, no_object_scale, coord_scale, class_scale: loss
            weights stored on the instance.
        saved_weights_name: file path given to ``ModelCheckpoint``.
        debug: stored on the instance; also switches the Keras verbosity
            from 1 to 2.
    """
    # Settings stored on the instance before the model is compiled.
    self.debug = debug
    self.batch_size = batch_size
    self.class_scale = class_scale
    self.coord_scale = coord_scale
    self.no_object_scale = no_object_scale
    self.object_scale = object_scale

    # One shared configuration for the training and validation generators.
    generator_config = dict([
        ('IMAGE_H', self.input_size),
        ('IMAGE_W', self.input_size),
        ('GRID_H', self.grid_h),
        ('GRID_W', self.grid_w),
        ('BOX', self.nb_box),
        ('LABELS', self.labels),
        ('CLASS', len(self.labels)),
        ('ANCHORS', self.anchors),
        ('BATCH_SIZE', self.batch_size),
        ('TRUE_BOX_BUFFER', self.max_box_per_image),
    ])

    normalize = self.feature_extractor.normalize
    train_generator = BatchGenerator(train_imgs, generator_config, norm=normalize)
    # The validation generator is built with jitter switched off.
    valid_generator = BatchGenerator(valid_imgs, generator_config, norm=normalize, jitter=False)

    train_steps = len(train_generator) * train_times
    valid_steps = len(valid_generator) * valid_times
    self.warmup_batches = warmup_epochs * (train_steps + valid_steps)

    # Compile with the custom loss.
    self.model.compile(
        loss=self.custom_loss,
        optimizer=Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0),
    )

    # Callbacks: early stopping, best-only checkpointing, TensorBoard logs.
    callbacks = [
        EarlyStopping(monitor='val_loss', min_delta=0.001, patience=3, mode='min', verbose=1),
        ModelCheckpoint(saved_weights_name, monitor='val_loss', verbose=1,
                        save_best_only=True, mode='min', period=1),
        TensorBoard(log_dir=os.path.expanduser('~/logs/'), histogram_freq=0,
                    write_graph=True, write_images=False),
    ]

    self.model.fit_generator(
        generator=train_generator,
        steps_per_epoch=train_steps,
        epochs=warmup_epochs + nb_epochs,
        verbose=2 if debug else 1,
        validation_data=valid_generator,
        validation_steps=valid_steps,
        callbacks=callbacks,
        workers=3,
        max_queue_size=8,
    )

    # Report average precision per label and the mean over all labels.
    average_precisions = self.evaluate(valid_generator)
    for label, average_precision in average_precisions.items():
        print(self.labels[label], '{:.4f}'.format(average_precision))
    mean_ap = sum(average_precisions.values()) / len(average_precisions)
    print('mAP: {:.4f}'.format(mean_ap))
def evaluate(self, generator, iou_threshold=0.3, score_threshold=0.3,
             max_detections=100, save_path=None):
    """Compute the average precision of every class over ``generator``.

    Adapted from https://github.com/fizyr/keras-retinanet.

    Args:
        generator: dataset object providing ``size()``, ``num_classes()``,
            ``load_image(i)`` and ``load_annotation(i)``. Annotation rows
            are read as four box values followed by the label in column 4.
        iou_threshold: minimum overlap for a detection to count as a
            true positive.
        score_threshold: accepted for interface compatibility; not used
            by this implementation.
        max_detections: accepted for interface compatibility; not used
            by this implementation.
        save_path: accepted for interface compatibility; not used by
            this implementation.

    Returns:
        dict mapping the integer label index to its average precision.
        A label without any annotation gets 0.
    """
    nb_images = generator.size()
    nb_classes = generator.num_classes()

    # all_detections[i][label] / all_annotations[i][label] hold the boxes
    # of image i that belong to that label.
    all_detections = [[None] * nb_classes for _ in range(nb_images)]
    all_annotations = [[None] * nb_classes for _ in range(nb_images)]

    for i in range(nb_images):
        raw_image = generator.load_image(i)
        raw_height, raw_width = raw_image.shape[:2]

        boxes = self.predict(raw_image)
        score = np.array([box.score for box in boxes])
        pred_labels = np.array([box.label for box in boxes])

        if len(boxes) > 0:
            # Scale the predicted coordinates by the raw image size.
            pred_boxes = np.array([[box.xmin * raw_width, box.ymin * raw_height,
                                    box.xmax * raw_width, box.ymax * raw_height,
                                    box.score] for box in boxes])
        else:
            # Keep the 5-column layout even when nothing was detected.
            pred_boxes = np.zeros((0, 5))

        # Highest score first.
        score_sort = np.argsort(-score)
        pred_labels = pred_labels[score_sort]
        pred_boxes = pred_boxes[score_sort]

        annotations = generator.load_annotation(i)
        for label in range(nb_classes):
            all_detections[i][label] = pred_boxes[pred_labels == label, :]
            all_annotations[i][label] = annotations[annotations[:, 4] == label, :4].copy()

    average_precisions = {}

    for label in range(nb_classes):
        # Plain lists: appending is O(1), whereas np.append copies the
        # whole array on every call.
        false_positives = []
        true_positives = []
        scores = []
        num_annotations = 0.0

        for i in range(nb_images):
            detections = all_detections[i][label]
            annotations = all_annotations[i][label]
            num_annotations += annotations.shape[0]
            matched = set()  # annotation indices already claimed in this image

            for d in detections:
                scores.append(d[4])

                hit = 0
                if annotations.shape[0] > 0:
                    overlaps = compute_overlap(np.expand_dims(d, axis=0), annotations)
                    assigned = int(np.argmax(overlaps, axis=1)[0])
                    # Each annotation can be matched by one detection only.
                    if overlaps[0, assigned] >= iou_threshold and assigned not in matched:
                        matched.add(assigned)
                        hit = 1

                true_positives.append(hit)
                false_positives.append(1 - hit)

        # No annotations -> AP for this class is reported as 0.
        if num_annotations == 0:
            average_precisions[label] = 0
            continue

        # Accumulate TP / FP counts in order of decreasing score.
        order = np.argsort(-np.asarray(scores, dtype=np.float64))
        cum_fp = np.cumsum(np.asarray(false_positives, dtype=np.float64)[order])
        cum_tp = np.cumsum(np.asarray(true_positives, dtype=np.float64)[order])

        recall = cum_tp / num_annotations
        precision = cum_tp / np.maximum(cum_tp + cum_fp, np.finfo(np.float64).eps)

        average_precisions[label] = compute_ap(recall, precision)

    return average_precisions


def predict(self, image):
    """Run the detector on one image and return the decoded boxes.

    Args:
        image: H x W x C image array, e.g. as returned by ``cv2.imread``.

    Returns:
        The value of ``decode_netout`` for the network output (the callers
        in this notebook iterate over it as a list of boxes).

    Raises:
        ValueError: if ``image`` is None (e.g. a failed read).
    """
    if image is None:
        raise ValueError("predict() received None instead of an image array")

    resized = cv2.resize(image, (self.input_size, self.input_size))
    resized = self.feature_extractor.normalize(resized)

    # Reverse the channel axis and add the batch dimension.
    # NOTE(review): presumably BGR -> RGB for OpenCV-loaded images; confirm.
    input_image = np.expand_dims(resized[:, :, ::-1], 0)
    # Placeholder for the model's second (true-boxes) input.
    dummy_array = np.zeros((1, 1, 1, 1, self.max_box_per_image, 4))

    netout = self.model.predict([input_image, dummy_array])[0]
    return decode_netout(netout, self.anchors, self.nb_class)
import random

num_anchors = 5


def IOU(ann, centroids):
    """Return the IoU between one box size and every centroid size.

    Boxes are compared by width/height only, i.e. as if they shared the
    same corner.

    Args:
        ann: ``(w, h)`` of one annotation.
        centroids: sequence/array of ``(w, h)`` pairs.

    Returns:
        ``np.ndarray`` of shape ``(k,)`` with one similarity per centroid.
    """
    w, h = ann
    boxes = np.asarray(centroids, dtype=float).reshape(-1, 2)
    c_w, c_h = boxes[:, 0], boxes[:, 1]

    # Overlap of two corner-aligned rectangles; covers all four
    # "which side is larger" cases at once.
    intersection = np.minimum(w, c_w) * np.minimum(h, c_h)
    union = w * h + c_w * c_h - intersection
    return intersection / union


def avg_IOU(anns, centroids):
    """Return the mean, over all annotations, of the best IoU with any centroid.

    Args:
        anns: array of shape ``(n, 2)`` with ``(w, h)`` rows.
        centroids: array of ``(w, h)`` centroids.
    """
    n, _ = anns.shape
    total = 0.

    for ann in anns:
        total += IOU(ann, centroids).max()

    return total / n


def print_anchors(centroids):
    """Print the centroids as ``anchors: [w,h, w,h, ...]`` sorted by width."""
    anchors = np.asarray(centroids)
    order = np.argsort(anchors[:, 0])

    # Index with scalars so '%f' receives numbers, not 1-element arrays.
    pairs = ['%0.2f,%0.2f' % (anchors[i, 0], anchors[i, 1]) for i in order]
    print("anchors: [" + ", ".join(pairs) + "]")


def run_kmeans(ann_dims, anchor_num):
    """Cluster box sizes with k-means using ``1 - IOU`` as the distance.

    Prints the change in the distance matrix on every iteration and stops
    when the cluster assignments no longer change.

    Args:
        ann_dims: array-like of shape ``(n, 2)`` with ``(w, h)`` rows.
        anchor_num: number of clusters (anchors) to produce.

    Returns:
        ``np.ndarray`` of shape ``(anchor_num, 2)`` with the centroids.

    Raises:
        ValueError: if ``ann_dims`` has no rows.
    """
    # Work in float so integer input does not truncate the centroid means.
    ann_dims = np.asarray(ann_dims, dtype=float)
    ann_num, anchor_dim = ann_dims.shape
    if ann_num == 0:
        raise ValueError("run_kmeans needs at least one annotation")

    # Seed with distinct annotations when possible: two identical seeds
    # leave one cluster permanently empty.
    if ann_num >= anchor_num:
        indices = random.sample(range(ann_num), anchor_num)
    else:
        indices = [random.randrange(ann_num) for _ in range(anchor_num)]
    centroids = ann_dims[indices]  # fancy indexing returns an independent copy

    prev_assignments = np.full(ann_num, -1)
    old_distances = np.zeros((ann_num, anchor_num))
    iteration = 0

    while True:
        iteration += 1
        # distances.shape == (ann_num, anchor_num)
        distances = np.array([1 - IOU(ann, centroids) for ann in ann_dims])

        print("iteration {}: dists = {}".format(iteration, np.sum(np.abs(old_distances - distances))))

        # Assign every sample to its nearest centroid.
        assignments = np.argmin(distances, axis=1)

        if (assignments == prev_assignments).all():
            return centroids

        # Recompute each centroid as the mean of its members.
        centroid_sums = np.zeros((anchor_num, anchor_dim), float)
        for cluster, ann in zip(assignments, ann_dims):
            centroid_sums[cluster] += ann
        for j in range(anchor_num):
            members = np.sum(assignments == j)
            if members:
                # The 1e-6 term is kept for numerical parity with the
                # original update rule.
                centroids[j] = centroid_sums[j] / (members + 1e-6)
            # An empty cluster keeps its previous centroid instead of
            # collapsing to a 0 x 0 anchor.

        prev_assignments = assignments.copy()
        old_distances = distances.copy()
# Grid resolution used to express box sizes in cell units
# (for the 416-pixel input this is 416 / 32 = 13 cells per side).
grid_h = tamanio/32
grid_w = tamanio/32

# Collect every annotated box size, measured in grid cells, and cluster them.
annotation_dims = []
for image in train_imgs:
    cell_h = image['height']/grid_h
    cell_w = image['width']/grid_w

    annotation_dims.extend(
        (
            float((float(obj['xmax']) - float(obj['xmin']))/cell_w),
            float((float(obj["ymax"]) - float(obj['ymin']))/cell_h),
        )
        for obj in image['object']
    )

annotation_dims = np.array(annotation_dims)
centroids = run_kmeans(annotation_dims, num_anchors)

# Report the clustering quality and the resulting anchors.
print('\naverage IOU for', num_anchors, 'anchors:', '%0.2f' % avg_IOU(annotation_dims, centroids))
print_anchors(centroids)

# Flatten the centroids into the [w0, h0, w1, h1, ...] list passed to YOLO.
# Approximate values for the Lego dataset:
# [1.90,3.02, 3.05,5.06, 4.35,2.91, 4.66,7.49, 7.24,10.12]
anchors = []
for centroid in centroids:
    anchors.extend((centroid[0], centroid[1]))
anchors
# --- Train the network -----------------------------------------------------

# Instantiate the model.
yolo = YOLO(input_size=tamanio,
            labels=labels,
            max_box_per_image=5,
            anchors=anchors)

# NOTE (from the notebook): on a 4-core CPU with 8 GB of RAM, training with
# 6 epochs can take about 7 hours.
yolo.train(train_imgs=train_imgs,
           valid_imgs=valid_imgs,
           train_times=6,
           valid_times=1,
           nb_epochs=6,
           learning_rate=1e-4,
           batch_size=8,
           warmup_epochs=2,
           object_scale=5,
           no_object_scale=1,
           coord_scale=1,
           class_scale=1,
           saved_weights_name=mejores_pesos,
           debug=True)


# --- Prediction on one image the network has never seen ---------------------

def draw_boxes(image, boxes, labels):
    """Draw every box and its "<label> <score>" caption onto ``image``.

    Args:
        image: H x W x C image array; it is drawn on in place.
        boxes: iterable of boxes exposing ``xmin``/``ymin``/``xmax``/``ymax``
            (multiplied by the image size here, so treated as fractions of
            it) plus ``get_label()`` and ``get_score()``.
        labels: sequence of class names indexed by ``box.get_label()``.

    Returns:
        The same ``image`` object, with the drawings applied.
    """
    image_h, image_w, _ = image.shape

    for box in boxes:
        xmin = int(box.xmin*image_w)
        ymin = int(box.ymin*image_h)
        xmax = int(box.xmax*image_w)
        ymax = int(box.ymax*image_h)

        cv2.rectangle(image, (xmin, ymin), (xmax, ymax), (0, 255, 0), 3)
        cv2.putText(image,
                    labels[box.get_label()] + ' ' + str(box.get_score()),
                    (xmin, ymin - 13),      # caption just above the box
                    cv2.FONT_HERSHEY_SIMPLEX,
                    1e-3 * image_h,         # font scale grows with image height
                    (0, 255, 0), 2)

    return image


mejores_pesos = "red_lego.h5"

image_path = "images/test/lego_girl.png"

mi_yolo = YOLO(input_size=tamanio,
               labels=labels,
               max_box_per_image=5,
               anchors=anchors)

mi_yolo.load_weights(mejores_pesos)

image = cv2.imread(image_path)
if image is None:
    # cv2.imread signals an unreadable/missing file by returning None.
    raise FileNotFoundError(image_path)
boxes = mi_yolo.predict(image)
image = draw_boxes(image, boxes, labels)

print('Detectados', len(boxes))

# Save next to the input as "<name>_detected<ext>".
image_root, image_ext = os.path.splitext(image_path)
cv2.imwrite(image_root + '_detected' + image_ext, image)


# --- On an mp4 video ---------------------------------------------------------

from tqdm import tqdm

video_path = 'images/test/lego_movie.mp4'
video_root, video_ext = os.path.splitext(video_path)
video_out = video_root + '_detected' + video_ext
video_reader = cv2.VideoCapture(video_path)

nb_frames = int(video_reader.get(cv2.CAP_PROP_FRAME_COUNT))
frame_h = int(video_reader.get(cv2.CAP_PROP_FRAME_HEIGHT))
frame_w = int(video_reader.get(cv2.CAP_PROP_FRAME_WIDTH))

# NOTE(review): the output frame rate is fixed at 50 fps regardless of the
# source video; confirm this is intended.
video_writer = cv2.VideoWriter(video_out,
                               cv2.VideoWriter_fourcc(*'MPEG'),
                               50.0,
                               (frame_w, frame_h))

try:
    for _ in tqdm(range(nb_frames)):
        ok, image = video_reader.read()
        if not ok:
            # The reported frame count can exceed the readable frames.
            break

        boxes = mi_yolo.predict(image)
        image = draw_boxes(image, boxes, labels)

        video_writer.write(np.uint8(image))
finally:
    # Release both handles even if prediction fails midway.
    video_reader.release()
    video_writer.release()


# --- From the webcam (press "q" to quit) -------------------------------------

win_name = 'Lego detection'
cv2.namedWindow(win_name)

video_reader = cv2.VideoCapture(0)

try:
    while True:
        ok, image = video_reader.read()
        if not ok:
            # Camera unavailable or stream ended.
            break

        boxes = mi_yolo.predict(image)
        image = draw_boxes(image, boxes, labels)

        cv2.imshow(win_name, image)

        key = cv2.waitKey(1) & 0xFF
        if key == ord('q'):
            break
finally:
    cv2.destroyAllWindows()
    video_reader.release()

# Full article: www.aprendemachinelearning.com