# detectors/yolo_openvino.py

import cv2
import time
import numpy as np

import time
import logging
import os
import sys
from math import exp as exp

from openvino.inference_engine import IECore

import ngraph as ng
# Make the ``common`` directory that sits next to this file's parent directory importable
# (path is built as <dir of this file>/../common).
sys.path.append(os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'common'))


# Import-time side effect: configures the root logger to print "[ LEVEL ] message" lines
# at INFO level on stdout.
logging.basicConfig(format="[ %(levelname)s ] %(message)s", level=logging.INFO, stream=sys.stdout)
# getLogger() without a name returns the root logger; the code below logs through it.
log = logging.getLogger()


class YoloParams:
    """Parameters of one YOLO output layer, read from its attribute mapping.

    Attributes set on the instance:
        num: number of anchors handled by the layer.
        coords: number of box coordinates per prediction.
        classes: number of classes per prediction.
        side: grid size passed in by the caller.
        anchors: flat ``[w0, h0, w1, h1, ...]`` anchor list.
        isYoloV3: True when the layer carries a non-empty ``mask``.
    """

    # Magic numbers are copied from yolo samples
    def __init__(self, param, side):
        """Read ``num``/``coords``/``classes``/``anchors``/``mask`` from ``param``.

        Missing keys fall back to the sample defaults. A truthy ``mask`` selects
        the anchor pairs it lists and overrides ``num`` with its length.
        """
        self.num = int(param['num']) if 'num' in param else 3
        self.coords = int(param['coords']) if 'coords' in param else 4
        self.classes = int(param['classes']) if 'classes' in param else 80
        self.side = side

        if 'anchors' in param:
            self.anchors = param['anchors']
        else:
            self.anchors = [10.0, 13.0, 16.0, 30.0, 33.0, 23.0, 30.0, 61.0, 62.0, 45.0, 59.0, 119.0,
                            116.0, 90.0, 156.0, 198.0, 373.0, 326.0]

        self.isYoloV3 = False

        selected = param.get('mask')
        if not selected:
            return

        # Keep only the (width, height) pairs named by the mask, in mask order.
        self.num = len(selected)
        self.anchors = [self.anchors[2 * idx + part] for idx in selected for part in (0, 1)]

        # Weak way to determine but the only one.
        self.isYoloV3 = True


class OpenvinoYOLO(object):
    def __init__(self, model_path, device, iou_threshold, score_threshold, num_streams="", cpu_extension=None, number_threads=1, keep_aspect_ratio=False):
        """Create the Inference Engine, read the network and load it onto ``device``.

        Args:
            model_path: Path of the network description file; the weights path is
                derived from it by replacing the extension with ``.bin``.
            device: Device name handed to ``load_network``; it is also searched for
                ``'CPU'`` / ``'GPU'`` to decide which plugin options are set.
            iou_threshold: Stored as ``self.iou_threshold``.
            score_threshold: Stored as ``self.prob_threshold``.
            num_streams: Either a digit string applied to every CPU/GPU entry found
                in ``device``, or a comma-separated list of ``DEVICE:count`` items.
            cpu_extension: If truthy, registered as an extension for ``'CPU'``.
            number_threads: If not ``None``, passed as ``CPU_THREADS_NUM``.
            keep_aspect_ratio: Stored as ``self.keep_aspect_ratio``.
        """
        self.keep_aspect_ratio = keep_aspect_ratio
        self.prob_threshold = score_threshold
        self.iou_threshold = iou_threshold

        # --- Inference Engine and per-device plugin options ---
        log.info("Creating Inference Engine...")
        ie = IECore()

        plugin_config = {}

        streams_by_device = {}
        if num_streams:
            if num_streams.isdigit():
                # One number for all: apply it to each supported device present in `device`.
                streams_by_device = {name: num_streams for name in ('CPU', 'GPU') if name in device}
            else:
                # "CPU:2,GPU:4"-style specification.
                streams_by_device = dict(item.split(':') for item in num_streams.split(','))

        if 'CPU' in device:
            if cpu_extension:
                ie.add_extension(cpu_extension, 'CPU')
            if number_threads is not None:
                plugin_config['CPU_THREADS_NUM'] = str(number_threads)
            if 'CPU' in streams_by_device:
                cpu_streams = streams_by_device['CPU']
                # A non-positive count means "let the plugin decide".
                if int(cpu_streams) > 0:
                    plugin_config['CPU_THROUGHPUT_STREAMS'] = cpu_streams
                else:
                    plugin_config['CPU_THROUGHPUT_STREAMS'] = 'CPU_THROUGHPUT_AUTO'

        if 'GPU' in device and 'GPU' in streams_by_device:
            gpu_streams = streams_by_device['GPU']
            if int(gpu_streams) > 0:
                plugin_config['GPU_THROUGHPUT_STREAMS'] = gpu_streams
            else:
                plugin_config['GPU_THROUGHPUT_STREAMS'] = 'GPU_THROUGHPUT_AUTO'

        # --- Read the network (description file + matching .bin weights) ---
        log.info("Loading network")
        weights_path = os.path.splitext(model_path)[0] + ".bin"
        self.net = ie.read_network(model_path, weights_path)

        # --- Inspect the first input to learn its layout and spatial size ---
        log.info("Preparing inputs")
        self.input_blob = next(iter(self.net.input_info))

        input_shape = self.net.input_info[self.input_blob].input_data.shape
        if input_shape[1] == 3:
            # Channels in position 1 -> treated as NCHW.
            self.input_height, self.input_width = input_shape[2:]
            self.nchw_shape = True
        else:
            # Otherwise treated as NHWC.
            self.input_height, self.input_width = input_shape[1:3]
            self.nchw_shape = False

        # --- Load the network onto the device with a single infer request ---
        log.info("Loading model to the plugin")

        self.network = ie.load_network(network=self.net, device_name=device, config=plugin_config, num_requests=1)
        log.info("OpenVINO model loaded!")


    def _resize(self, image, size, interpolation=cv2.INTER_LINEAR):
        """Resize ``image`` to ``size`` given as ``(width, height)``.

        When ``self.keep_aspect_ratio`` is false the image is simply stretched.
        Otherwise it is scaled to fit inside ``size`` and centred on a 3-channel
        ``uint8`` canvas filled with the value 128.
        """
        if self.keep_aspect_ratio:
            src_h, src_w = image.shape[:2]
            dst_w, dst_h = size

            # Largest uniform scale at which the whole image still fits.
            ratio = min(dst_w/src_w, dst_h/src_h)
            fit_w = int(src_w*ratio)
            fit_h = int(src_h*ratio)
            fitted = cv2.resize(image, (fit_w, fit_h), interpolation=interpolation)

            # Grey canvas of the target size with the fitted image pasted in the middle.
            canvas = np.full((dst_h, dst_w, 3), 128, dtype=np.uint8)
            left = (dst_w-fit_w)//2
            top = (dst_h-fit_h)//2
            canvas[top:top+fit_h, left:left+fit_w, :] = fitted
            return canvas

        return cv2.resize(image, size, interpolation=interpolation)


    def _preprocess(self, frame):
        """Resize ``frame`` to the network input size and add a leading batch axis.

        The result is transposed from HWC to CHW first when the network input was
        detected as NCHW (``self.nchw_shape``).
        """
        target_size = (self.input_width, self.input_height)
        resized = self._resize(frame, target_size)
        if self.nchw_shape:
            # HWC -> CHW
            resized = resized.transpose((2, 0, 1))
        return np.expand_dims(resized, axis=0)


    def _scale_bbox(self, x, y, height, width, class_id, confidence, im_h, im_w, is_proportional):
        if is_proportional:
            scale = np.array([min(im_w/im_h, 1), min(im_h/im_w, 1)])
            offset = 0.5*(np.ones(2) - scale)
            x, y = (np.array([x, y]) - offset) / scale
            width, height = np.array([width, height]) / scale
        xmin = int((x - width / 2) * im_w)
        ymin = int((y - height / 2) * im_h)
        xmax = int(xmin + width * im_w)
        ymax = int(ymin + height * im_h)
        # Method item() used here to convert NumPy types to native types for compatibility with functions, which don't
        # support Numpy types (e.g., cv2.rectangle doesn't support int64 in color parameter)
        return dict(xmin=xmin, xmax=xmax, ymin=ymin, ymax=ymax, class_id=class_id.item(), confidence=confidence.item())


    def _parse_yolo_region(self, predictions, resized_image_shape, original_im_shape, params, threshold, is_proportional):
        # ------------------------------------------ Validating output parameters ------------------------------------------
        _, _, out_blob_h, out_blob_w = predictions.shape
        assert out_blob_w == out_blob_h, "Invalid size of output blob. It sould be in NCHW layout and height should " \
                                        "be equal to width. Current height = {}, current width = {}" \
                                        "".format(out_blob_h, out_blob_w)

        # ------------------------------------------ Extracting layer parameters -------------------------------------------
        orig_im_h, orig_im_w = original_im_shape
        resized_image_h, resized_image_w = resized_image_shape
        objects = list()
        size_normalizer = (resized_image_w, resized_image_h) if params.isYoloV3 else (params.side, params.side)
        bbox_size = params.coords + 1 + params.classes
        # ------------------------------------------- Parsing YOLO Region output -------------------------------------------
        for row, col, n in np.ndindex(params.side, params.side, params.num):
            # Getting raw values for each detection bounding box
            bbox = predictions[0, n*bbox_size:(n+1)*bbox_size, row, col]
            x, y, width, height, object_probability = bbox[:5]
            class_probabilities = bbox[5:]
            if object_probability < threshold:
                continue
            # Process raw value
            x = (col + x) / params.side
            y = (row + y) / params.side
            # Value for exp is very big number in some cases so following construction is using here
            try:
                width = exp(width)
                height = exp(height)
            except OverflowError:
                continue
            # Depends on topology we need to normalize sizes by feature maps (up to YOLOv3) or by input shape (YOLOv3)
            width = width * params.anchors[2 * n] / size_normalizer[0]
            height = height * params.anchors[2 * n + 1] / size_normalizer[1]

            class_id = np.argmax(class_probabilities)
            confidence = class_probabilities[class_id]*object_probability
            if confidence < threshold:
                continue
            objects.append(self._scale_bbox(x=x, y=y, height=height, width=width, class_id=class_id, confidence=confidence,
                                    im_h=orig_im_h, im_w=orig_im_w, is_proportional=is_proportional))
        return objects


    def _get_objects(self, output, source_height_width):
        """Decode every output blob of the network into one flat list of detections.

        Args:
            output: Mapping of output layer name to its result blob.
            source_height_width: ``(height, width)`` of the original image.

        Returns:
            List of detection dicts gathered from all output layers.
        """
        detections = []
        ng_function = ng.function_from_cnn(self.net)
        for layer_name, raw_blob in output.items():
            blob = raw_blob.reshape(self.net.outputs[layer_name].shape)

            # Attributes of the op(s) named like this output layer; the first match is used.
            matching_attrs = [op._get_attributes()
                              for op in ng_function.get_ordered_ops()
                              if op.get_friendly_name() == layer_name]
            layer_params = YoloParams(matching_attrs[0], blob.shape[2])

            detections.extend(
                self._parse_yolo_region(blob, (self.input_height, self.input_width), source_height_width,
                                        layer_params, self.prob_threshold, self.keep_aspect_ratio))

        return detections


    def _intersection_over_union(self, box_1, box_2):
        width_of_overlap_area = min(box_1['xmax'], box_2['xmax']) - max(box_1['xmin'], box_2['xmin'])
        height_of_overlap_area = min(box_1['ymax'], box_2['ymax']) - max(box_1['ymin'], box_2['ymin'])
        if width_of_overlap_area < 0 or height_of_overlap_area < 0:
            area_of_overlap = 0
        else:
            area_of_overlap = width_of_overlap_area * height_of_overlap_area
        box_1_area = (box_1['ymax'] - box_1['ymin']) * (box_1['xmax'] - box_1['xmin'])
        box_2_area = (box_2['ymax'] - box_2['ymin']) * (box_2['xmax'] - box_2['xmin'])
        area_of_union = box_1_area + box_2_area - area_of_overlap
        if area_of_union == 0:
            return 0
        return area_of_overlap / area_of_union


    def _filter_objects(self, objects):
        # Filtering overlapping boxes with respect to the --iou_threshold CLI parameter
        objects = sorted(objects, key=lambda obj : obj['confidence'], reverse=True)
        for i in range(len(objects)):
            if objects[i]['confidence'] == 0:
                continue
            for j in range(i + 1, len(objects)):
                if self._intersection_over_union(objects[i], objects[j]) > self.iou_threshold:
                    objects[j]['confidence'] = 0

        return tuple(obj for obj in objects if obj['confidence'] >= self.prob_threshold)
    

    def _postprocess(self, output, img_shape):
        """Turn raw network output into box, score and class arrays.

        Args:
            output: Mapping of output layer name to result blob.
            img_shape: ``(height, width)`` of the original image.

        Returns:
            Tuple ``(bboxes, scores, classes, num_objects)`` where ``bboxes`` is a
            float array of shape ``(N, 4)`` holding ``xmin, ymin, width, height``
            and ``num_objects`` is a 0-d array holding ``N``. With no detections
            ``bboxes`` has shape ``(0, 4)``.
        """
        detections = self._filter_objects(self._get_objects(output, img_shape))

        img_h, img_w = img_shape[0], img_shape[1]

        corners = []
        scores = []
        classes = []
        for obj in detections:
            # Clip the box to the image: lower corner at 0, upper corner at the image size.
            xmin = max(obj['xmin'], 0)
            ymin = max(obj['ymin'], 0)
            xmax = min(obj['xmax'], img_w)
            ymax = min(obj['ymax'], img_h)

            corners.append([xmin, ymin, xmax, ymax])
            classes.append(obj['class_id'])
            scores.append(obj['confidence'])

        # reshape(-1, 4) keeps the array 2-D even with zero detections; without it an
        # empty list becomes a 1-D array and the column slicing below raises IndexError.
        bboxes = np.array(corners, dtype=np.float64).reshape(-1, 4)
        # xmin, ymin, xmax, ymax ---> xmin, ymin, width, height
        bboxes[:, 2:] -= bboxes[:, :2]

        return bboxes, np.array(scores), np.array(classes), np.array(len(classes))


    def detect(self, img):
        """Run the detector on one image.

        Args:
            img: Image array whose first two axes are height and width.

        Returns:
            Tuple ``(bboxes, scores, classes, num_objects, inference_seconds)``; the
            first four come from ``_postprocess`` and the last is the time spent in
            the ``infer`` call only.
        """
        # Resize the input frame to the network size.
        in_frame = self._preprocess(img)

        # perf_counter() is monotonic, so the measured interval cannot be distorted
        # by system clock adjustments the way time.time() differences can.
        inference_start = time.perf_counter()
        output = self.network.infer(inputs={self.input_blob: in_frame})
        inference_seconds = time.perf_counter() - inference_start

        bboxes, scores, classes, num_objects = self._postprocess(output, img_shape=img.shape[:2])

        return bboxes, scores, classes, num_objects, inference_seconds