# simple-keras/yolov3.py

"""
YOLO v3 object detection with Keras

Source: https://towardsdatascience.com/yolo-v3-object-detection-with-keras-461d2cfccef6
"""
import struct
import glob
import numpy as np
from numpy import expand_dims
from keras.layers import Input, Conv2D, BatchNormalization, LeakyReLU, ZeroPadding2D, UpSampling2D
from keras.models import Model
from keras.layers.merge import add, concatenate
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from matplotlib import pyplot
from matplotlib.patches import Rectangle

# Step 1:
# Define WeightReader class
class WeightReader:
    """
    Parse a "yolov3.weights" file and copy its values into a Keras model.

    The whole file is read once by the constructor: the header is skipped and
    everything after it is exposed as a flat float32 array (``all_weights``).
    ``offset`` is the index of the next unread value in that array and is
    advanced by ``read_bytes``.
    """
    def __init__(self, weight_file):
        """
        Read ``weight_file`` into memory.

        The header consists of three 4-byte ints (major, minor, revision)
        followed by one more field that is skipped: 8 bytes wide when
        ``major * 10 + minor >= 2`` (and both numbers are below 1000), 4 bytes
        wide otherwise.  The remaining bytes are interpreted as float32 values.
        """
        with open(weight_file, 'rb') as w_f:
            major,  = struct.unpack('i', w_f.read(4))
            minor,  = struct.unpack('i', w_f.read(4))
            w_f.read(4) # ignore revision

            if (major * 10 + minor) >= 2 and major < 1000 and minor < 1000:
                w_f.read(8)
            else:
                w_f.read(4)

            binary = w_f.read()
        self.offset = 0
        self.all_weights = np.frombuffer(binary, dtype='float32')

    def read_bytes(self, size):
        """
        Return the next ``size`` values from ``all_weights`` and advance ``offset``.

        Despite the name, ``size`` counts float32 values, not bytes.

        Raises:
            ValueError: if fewer than ``size`` values remain.  ``offset`` is
                left untouched in that case.
        """
        size = int(size)
        end = self.offset + size

        # A plain slice would silently come back short on a truncated file;
        # fail here, where the cause is still obvious.
        if end > len(self.all_weights):
            raise ValueError(
                "weights exhausted: requested %d values at offset %d, only %d available"
                % (size, self.offset, len(self.all_weights) - self.offset))

        chunk = self.all_weights[self.offset:end]
        self.offset = end

        return chunk

    def load_weights(self, model):
        """
        Load weights into created model.

        Walks layer indices 0..105.  For every index that has a ``conv_<i>``
        layer in ``model`` the matching ``bnorm_<i>`` layer (absent for indices
        81, 93 and 105) and the convolution itself are filled from the weight
        array, in file order.  Indices without a convolution are reported and
        skipped.

        Raises:
            ValueError: if the weight array runs out, a kernel cannot be
                reshaped to the layer's shape, or an expected ``bnorm_<i>``
                layer is missing.
        """
        for i in range(106):
            # Guard only the lookup: a missing conv layer is expected for some
            # indices, but any other ValueError (truncated file, shape
            # mismatch) must not be mistaken for "no convolution".
            try:
                conv_layer = model.get_layer('conv_' + str(i))
            except ValueError:
                print("no convolution #" + str(i))
                continue

            print("loading weights of convolution #" + str(i))

            if i not in [81, 93, 105]:
                norm_layer = model.get_layer('bnorm_' + str(i))
                size = np.prod(norm_layer.get_weights()[0].shape)
                beta  = self.read_bytes(size) # bias
                gamma = self.read_bytes(size) # scale
                mean  = self.read_bytes(size) # mean
                var   = self.read_bytes(size) # variance
                # The file stores beta before gamma; set_weights gets gamma first.
                norm_layer.set_weights([gamma, beta, mean, var])

            conv_weights = conv_layer.get_weights()
            has_bias = len(conv_weights) > 1

            # When present, the bias precedes the kernel in the file.
            if has_bias:
                bias = self.read_bytes(np.prod(conv_weights[1].shape))

            # The file keeps the kernel in the reverse of the layer's axis
            # order: reshape to the reversed shape, then permute back.
            kernel_shape = conv_weights[0].shape
            kernel = self.read_bytes(np.prod(kernel_shape))
            kernel = kernel.reshape(list(reversed(kernel_shape)))
            kernel = kernel.transpose([2, 3, 1, 0])

            conv_layer.set_weights([kernel, bias] if has_bias else [kernel])

    def reset(self):
        """
        Resets offset to restart loading weights.
        """
        self.offset = 0

# Step 2
def _conv_block(input_layer, convs, skip=True):
    """
    Function to create convolutional layer.
    """
    tmp = input_layer
    count = 0
    for conv in convs:
        if count == (len(convs) - 2) and skip:
            skip_connection = tmp
        count += 1

        # Peculiar padding as darknet prefer left and top
        if conv['stride'] > 1:
            tmp = ZeroPadding2D(((1,0),(1,0)))(tmp)

        tmp = Conv2D(conv['filter'],
                     conv['kernel'],
                     strides=conv['stride'],
                     # Peculiar padding as darknet prefer left and top
                     padding='valid' if conv['stride'] > 1 else 'same',
                     name='conv_' + str(conv['layer_idx']),
                     use_bias=False if conv['bnorm'] else True)(tmp)

        if conv['bnorm']:
            tmp = BatchNormalization(epsilon=0.001, name='bnorm_' + str(conv['layer_idx']))(tmp)
        if conv['leaky']:
            tmp = LeakyReLU(alpha=0.1, name='leaky_' + str(conv['layer_idx']))(tmp)

    return add([skip_connection, tmp]) if skip else tmp

def make_yolov3_model():
    """
    Assemble the full YOLO v3 network and return it as a Keras ``Model``.

    The network takes an RGB image of arbitrary height and width and produces
    three output tensors (``yolo_82``, ``yolo_94``, ``yolo_106``).  Each output
    comes from a 255-filter convolution; the second and third are computed
    after upsampling by two and concatenating with an earlier feature map.
    All ``layer_idx`` values below are the numbers used to name the layers
    (``conv_<idx>``, ``bnorm_<idx>``, ``leaky_<idx>``).
    """
    def bn_leaky(filters, kernel, layer_idx, stride=1):
        # Convolution followed by batch normalisation and leaky ReLU.
        return {'filter': filters, 'kernel': kernel, 'stride': stride,
                'bnorm': True, 'leaky': True, 'layer_idx': layer_idx}

    def linear_head(layer_idx):
        # Output convolution: 255 filters, 1x1, no normalisation or activation.
        return {'filter': 255, 'kernel': 1, 'stride': 1,
                'bnorm': False, 'leaky': False, 'layer_idx': layer_idx}

    inputs = Input(shape=(None, None, 3))

    # Layer  0 => 4
    net = _conv_block(inputs, [bn_leaky(32, 3, 0),
                               bn_leaky(64, 3, 1, stride=2),
                               bn_leaky(32, 1, 2),
                               bn_leaky(64, 3, 3)])

    # Layer  5 => 8
    net = _conv_block(net, [bn_leaky(128, 3, 5, stride=2),
                            bn_leaky(64, 1, 6),
                            bn_leaky(128, 3, 7)])

    # Layer  9 => 11
    net = _conv_block(net, [bn_leaky(64, 1, 9),
                            bn_leaky(128, 3, 10)])

    # Layer 12 => 15
    net = _conv_block(net, [bn_leaky(256, 3, 12, stride=2),
                            bn_leaky(128, 1, 13),
                            bn_leaky(256, 3, 14)])

    # Layer 16 => 36: seven residual pairs starting at 16, 19, ..., 34
    for first_idx in range(16, 37, 3):
        net = _conv_block(net, [bn_leaky(128, 1, first_idx),
                                bn_leaky(256, 3, first_idx + 1)])
    route_36 = net

    # Layer 37 => 40
    net = _conv_block(net, [bn_leaky(512, 3, 37, stride=2),
                            bn_leaky(256, 1, 38),
                            bn_leaky(512, 3, 39)])

    # Layer 41 => 61: seven residual pairs starting at 41, 44, ..., 59
    for first_idx in range(41, 62, 3):
        net = _conv_block(net, [bn_leaky(256, 1, first_idx),
                                bn_leaky(512, 3, first_idx + 1)])
    route_61 = net

    # Layer 62 => 65
    net = _conv_block(net, [bn_leaky(1024, 3, 62, stride=2),
                            bn_leaky(512, 1, 63),
                            bn_leaky(1024, 3, 64)])

    # Layer 66 => 74: three residual pairs starting at 66, 69, 72
    for first_idx in range(66, 75, 3):
        net = _conv_block(net, [bn_leaky(512, 1, first_idx),
                                bn_leaky(1024, 3, first_idx + 1)])

    # Layer 75 => 79
    net = _conv_block(net,
                      [bn_leaky(512, 1, 75),
                       bn_leaky(1024, 3, 76),
                       bn_leaky(512, 1, 77),
                       bn_leaky(1024, 3, 78),
                       bn_leaky(512, 1, 79)],
                      skip=False)

    # Layer 80 => 82: first output, branches off without replacing `net`
    yolo_82 = _conv_block(net,
                          [bn_leaky(1024, 3, 80),
                           linear_head(81)],
                          skip=False)

    # Layer 83 => 86: shrink channels, upsample, merge with the layer-61 features
    net = _conv_block(net, [bn_leaky(256, 1, 84)], skip=False)
    net = UpSampling2D(2)(net)
    net = concatenate([net, route_61])

    # Layer 87 => 91
    net = _conv_block(net,
                      [bn_leaky(256, 1, 87),
                       bn_leaky(512, 3, 88),
                       bn_leaky(256, 1, 89),
                       bn_leaky(512, 3, 90),
                       bn_leaky(256, 1, 91)],
                      skip=False)

    # Layer 92 => 94: second output
    yolo_94 = _conv_block(net,
                          [bn_leaky(512, 3, 92),
                           linear_head(93)],
                          skip=False)

    # Layer 95 => 98: shrink channels, upsample, merge with the layer-36 features
    net = _conv_block(net, [bn_leaky(128, 1, 96)], skip=False)
    net = UpSampling2D(2)(net)
    net = concatenate([net, route_36])

    # Layer 99 => 106: third output
    yolo_106 = _conv_block(net,
                           [bn_leaky(128, 1, 99),
                            bn_leaky(256, 3, 100),
                            bn_leaky(128, 1, 101),
                            bn_leaky(256, 3, 102),
                            bn_leaky(128, 1, 103),
                            bn_leaky(256, 3, 104),
                            linear_head(105)],
                           skip=False)

    return Model(inputs, [yolo_82, yolo_94, yolo_106])

# Step 4:
# Load and preprocess an image for prediction
def load_image_pixels(filename, shape):
    """
    Load an image file and turn it into a one-sample batch for the network.

    The file is opened twice: once untouched to learn its original size, and
    once resized to ``shape``.  ``shape`` is forwarded unchanged as
    ``target_size`` to ``load_img``.
    NOTE(review): Keras documents ``target_size`` as (height, width) - confirm
    the ordering at call sites that use a non-square shape.

    Returns:
        A tuple ``(image, width, height)``: ``image`` is a float32 array with a
        leading batch axis of length one and pixel values divided by 255;
        ``width`` and ``height`` are the size of the original, unresized image.
    """
    # Original dimensions are needed later to map boxes back onto the photo.
    width, height = load_img(filename).size

    # Second load resizes to the network input size.
    resized = load_img(filename, target_size=shape)

    # Scale pixel values from [0, 255] down to [0, 1].
    pixels = img_to_array(resized).astype('float32') / 255.0

    # Prepend the batch axis so the array represents a single sample.
    return expand_dims(pixels, 0), width, height

# Step 4: Decode the prediction output to rectangle coordinates
class BoundBox:
    """
    One candidate detection: box corners, objectness and per-class scores.

    ``xmin``/``ymin``/``xmax``/``ymax`` are the box corners, ``objness`` is the
    objectness score and ``classes`` is an indexable sequence of per-class
    scores.  ``label`` and ``score`` cache the best class index and its score;
    both hold -1 until first requested.
    """
    def __init__(self, xmin, ymin, xmax, ymax, objness = None, classes = None):
        self.xmin = xmin
        self.ymin = ymin
        self.xmax = xmax
        self.ymax = ymax
        self.objness = objness
        self.classes = classes
        self.label = -1
        self.score = -1

    def get_label(self):
        """
        Return the index of the highest-scoring class (computed once, then cached).
        """
        if self.label == -1:
            self.label = np.argmax(self.classes)

        return self.label

    def get_score(self):
        """
        Return the score of the highest-scoring class (computed once, then cached).
        """
        if self.score == -1:
            self.score = self.classes[self.get_label()]

        # Return the cached value itself, not the bound method.
        return self.score

def _sigmoid(inp):
    return 1. / (1. + np.exp(-inp))

def decode_netout(netout, anchors, obj_thresh, net_h, net_w):
    """
    Decode one raw network output into a list of ``BoundBox`` candidates.

    ``netout`` has the grid height and width as its first two axes; the rest
    is split into 3 boxes per grid cell.  For every box the values are laid
    out as ``[x, y, w, h, objectness, class scores...]``.  ``anchors`` is a
    flat sequence of (width, height) pairs, one pair per box.

    Steps:
      * x, y, objectness and class scores go through a sigmoid;
      * class scores are multiplied by objectness and zeroed when the product
        does not exceed ``obj_thresh``;
      * boxes whose objectness does not exceed ``obj_thresh`` are dropped;
      * centres are converted to fractions of the grid, sizes to fractions of
        the network input (``net_w`` x ``net_h``) using the anchors.

    NOTE: the reshaped array is modified in place; since ``reshape`` may return
    a view, the caller's ``netout`` may be modified as well.  The ``classes``
    of each returned box is a slice of that same array.

    Returns:
        A list of ``BoundBox`` with corner coordinates expressed as fractions
        of the image width/height.
    """
    grid_h, grid_w = netout.shape[:2]
    nb_box = 3
    netout = netout.reshape((grid_h, grid_w, nb_box, -1))

    netout[..., :2]  = _sigmoid(netout[..., :2])
    netout[..., 4:]  = _sigmoid(netout[..., 4:])
    netout[..., 5:]  = netout[..., 4][..., np.newaxis] * netout[..., 5:]
    netout[..., 5:] *= netout[..., 5:] > obj_thresh

    boxes = []

    # Row and column are iterated as integers.  The earlier `i / grid_w` was a
    # float division whose fractional part leaked into the y centre below.
    for row in range(grid_h):
        for col in range(grid_w):
            for j in range(nb_box):
                cell = netout[row, col, j]

                # 4th element is objectness score.  Compare the value itself:
                # `objectness.all()` collapsed it to a bool, so nearly every
                # box slipped through the filter.
                objectness = cell[4]
                if objectness <= obj_thresh:
                    continue

                # First 4 elements are x, y, w, and h
                x, y, w, h = cell[:4]
                x = (col + x) / grid_w # Center position, unit: image width
                y = (row + y) / grid_h # Center position, unit: image height
                w = anchors[2 * j + 0] * np.exp(w) / net_w # Unit: image width
                h = anchors[2 * j + 1] * np.exp(h) / net_h # Unit: image height

                # Last elements are class probabilities
                classes = cell[5:]

                boxes.append(BoundBox(x - w / 2, y - h / 2, x + w / 2, y + h / 2,
                                      objectness, classes))

    return boxes

# Step 5
def correct_yolo_boxes(boxes, image_h, image_w, net_h, net_w):
    """
    Stretch the boxes so that they fit the original image.

    Every box is updated in place: its corner coordinates are rescaled by the
    image width/height and truncated to ints.  The "new" size is fixed to the
    network size, so the offsets evaluate to zero and the scales to one.

    Returns:
        None; ``boxes`` is modified in place.
    """
    new_w = net_w
    new_h = net_h

    for box in boxes:
        x_offset = (net_w - new_w) / 2. / net_w
        x_scale = float(new_w) / net_w
        y_offset = (net_h - new_h) / 2. / net_h
        y_scale = float(new_h) / net_h

        # (attribute, offset, scale, image extent) in the order they are updated
        corrections = (
            ('xmin', x_offset, x_scale, image_w),
            ('xmax', x_offset, x_scale, image_w),
            ('ymin', y_offset, y_scale, image_h),
            ('ymax', y_offset, y_scale, image_h),
        )
        for attr, offset, scale, extent in corrections:
            setattr(box, attr, int((getattr(box, attr) - offset) / scale * extent))

# Step 6
def _interval_overlap(interval_a, interval_b):
    """
    Implementing IOU

    """
    x1, x2 = interval_a
    x3, x4 = interval_b

    if x3 < x1:
        if x4 < x1:
            return 0
        else:
            return min(x2,x4) - x1
    else:
        if x2 < x3:
            return 0
        else:
            return min(x2,x4) - x3

def bbox_iou(box1, box2):
    """
    Return the intersection-over-union of two boxes.

    Both arguments need ``xmin``, ``xmax``, ``ymin`` and ``ymax`` attributes.

    Returns:
        ``intersection area / union area`` as a float, or 0.0 when the union
        has no area (for example two boxes that collapsed to zero width or
        height after their coordinates were truncated to ints).
    """
    intersect_w = _interval_overlap([box1.xmin, box1.xmax], [box2.xmin, box2.xmax])
    intersect_h = _interval_overlap([box1.ymin, box1.ymax], [box2.ymin, box2.ymax])
    intersect = intersect_w * intersect_h

    w1, h1 = box1.xmax - box1.xmin, box1.ymax - box1.ymin
    w2, h2 = box2.xmax - box2.xmin, box2.ymax - box2.ymin
    union = w1 * h1 + w2 * h2 - intersect

    # Degenerate boxes would otherwise divide by zero.
    if union <= 0:
        return 0.0

    return float(intersect) / union

def do_nms(boxes, nms_thresh):
    """
    Suppress non-maximal boxes, one class at a time.

    For every class the boxes are visited from highest to lowest score.  Each
    box that still has a non-zero score for that class zeroes the score of
    every lower-ranked box whose IoU with it is at least ``nms_thresh``.

    Returns:
        None; the ``classes`` of the boxes are modified in place.
    """
    if len(boxes) == 0:
        return

    nb_class = len(boxes[0].classes)

    for c in range(nb_class):
        # Negate the scores so that argsort yields descending order.
        ranking = np.argsort([-box.classes[c] for box in boxes])

        for rank, keeper_idx in enumerate(ranking):
            keeper = boxes[keeper_idx]

            # Already suppressed (or never scored) for this class.
            if keeper.classes[c] == 0:
                continue

            for other_idx in ranking[rank + 1:]:
                other = boxes[other_idx]

                if bbox_iou(keeper, other) >= nms_thresh:
                    other.classes[c] = 0

def get_boxes(boxes, labels, thresh):
    """
    Collect every (box, label) pair whose class score exceeds ``thresh``.

    A box is reported once per qualifying label, so the same box may appear
    several times.

    Returns:
        Three parallel lists ``(v_boxes, v_labels, v_scores)``; scores are the
        class scores multiplied by 100.
    """
    # One entry per (box, label) hit, in box order and then label order.
    hits = [
        (box, label, box.classes[idx] * 100)
        for box in boxes
        for idx, label in enumerate(labels)
        if box.classes[idx] > thresh
    ]

    v_boxes = [hit[0] for hit in hits]
    v_labels = [hit[1] for hit in hits]
    v_scores = [hit[2] for hit in hits]

    return v_boxes, v_labels, v_scores

def draw_boxes(filename, v_boxes, v_labels, v_scores):
    """
    Show the image with every detected box, its label and its score drawn on top.

    ``v_boxes``, ``v_labels`` and ``v_scores`` are parallel sequences: entry
    ``i`` of each describes the same detection.  Boxes need ``xmin``, ``ymin``,
    ``xmax`` and ``ymax`` attributes in image pixel coordinates.

    The function opens a pyplot window via ``pyplot.show()`` and returns None.
    """
    # Load the image
    data = pyplot.imread(filename)
    # Plot the image
    pyplot.imshow(data)
    # Get the context for drawing boxes
    ax = pyplot.gca()
    # Plot each box
    for i, box in enumerate(v_boxes):
        # Get coordinates
        y1, x1, y2, x2 = box.ymin, box.xmin, box.ymax, box.xmax
        # Calculate width and height of the box
        width, height = x2 - x1, y2 - y1
        # Create the shape (line width is a number, not a string)
        rect = Rectangle((x1, y1), width, height, fill=False, color='red', linewidth=2)
        # Draw the box
        ax.add_patch(rect)
        # Draw text and score in top left corner
        label = "%s (%.3f)" % (v_labels[i], v_scores[i])
        pyplot.text(x1, y1, label, color='white', backgroundcolor='red')

    # Show the plot
    pyplot.show()

# Step 7:
# Declare several configurations

# Define the anchors
# One inner list per network output, in the order the model returns them:
# make_prediction passes ANCHORS[i] to decode_netout together with output i.
# Each inner list is three (width, height) pairs flattened; decode_netout
# reads pair j as anchors[2 * j] and anchors[2 * j + 1] and divides them by
# the network input width and height respectively.
ANCHORS = [[116,90, 156,198, 373,326], [30,61, 62,45, 59,119], [10,13, 16,30, 33,23]]

# Define the probability threshold for detected objects
# Used twice in make_prediction: as the objectness threshold in decode_netout
# and as the class-score threshold in get_boxes.
CLASS_THRESHOLD = 0.6

# Define the labels
# Position matters: get_boxes pairs LABELS[i] with box.classes[i], so the
# order must match the class order of the network output.  The trailing
# comments give the index of every fifth entry (80 entries, indices 0-79).
LABELS = ["person",             # 0
          "bicycle",
          "car",
          "motorbike",
          "aeroplane",
          "bus",                # 5
          "train",
          "truck",
          "boat",
          "traffic light",
          "fire hydrant",       # 10
          "stop sign",
          "parking meter",
          "bench",
          "bird",
          "cat",                # 15
          "dog",
          "horse",
          "sheep",
          "cow",
          "elephant",           # 20
          "bear",
          "zebra",
          "giraffe",
          "backpack",
          "umbrella",           # 25
          "handbag",
          "tie",
          "suitcase",
          "frisbee",
          "skis",               # 30
          "snowboard",
          "sports ball",
          "kite",
          "baseball bat",
          "baseball glove",     # 35
          "skateboard",
          "surfboard",
          "tennis racket",
          "bottle",
          "wine glass",         # 40
          "cup",
          "fork",
          "knife",
          "spoon",
          "bowl",               # 45
          "banana",
          "apple",
          "sandwich",
          "orange",
          "broccoli",           # 50
          "carrot",
          "hot dog",
          "pizza",
          "donut",
          "cake",               # 55
          "chair",
          "sofa",
          "pottedplant",
          "bed",
          "diningtable",        # 60
          "toilet",
          "tvmonitor",
          "laptop",
          "mouse",
          "remote",             # 65
          "keyboard",
          "cell phone",
          "microwave",
          "oven",
          "toaster",            # 70
          "sink",
          "refrigerator",
          "book",
          "clock",
          "vase",               # 75
          "scissors",
          "teddy bear",
          "hair drier",
          "toothbrush"]

def make_prediction(model, image_glob="images/test/motorbike/images2.jpg"):
    """
    Execute predictions with YOLO v3.

    For every file matching ``image_glob`` the image is loaded at the network
    input size (416 x 416), run through ``model``, and the three outputs are
    decoded, rescaled to the original image, filtered by non-maximum
    suppression and ``CLASS_THRESHOLD``, printed and finally drawn.

    Args:
        model: object with a ``predict`` method returning one array per
            entry of ``ANCHORS``, each with a leading batch axis.
        image_glob: glob pattern selecting the images to process; defaults to
            the single test image used so far.
    """
    # Define the expected input shape for the model
    input_w, input_h = 416, 416

    for photo_filename in glob.glob(image_glob):
        # The target size is handed to Keras' load_img, which expects
        # (height, width) - keep that order even though both are equal here.
        image, image_w, image_h = load_image_pixels(photo_filename, (input_h, input_w))

        # Make prediction
        netouts = model.predict(image)

        # Summarize the shape of the list of arrays
        print([a.shape for a in netouts])

        boxes = []

        for i, netout in enumerate(netouts):
            # Decode the output of the network (netout[0] drops the batch axis)
            boxes += decode_netout(netout[0], ANCHORS[i], CLASS_THRESHOLD, input_h, input_w)

        # Correct the sizes of the bounding boxes for the shape of the image
        correct_yolo_boxes(boxes, image_h, image_w, input_h, input_w)

        # Suppress non-maximal boxes
        do_nms(boxes, 0.5)

        # Get the details of the detected objects
        v_boxes, v_labels, v_scores = get_boxes(boxes, LABELS, CLASS_THRESHOLD)

        # Summarize what we found
        for label, score in zip(v_labels, v_scores):
            print(label, score)

        # Draw what we found
        draw_boxes(photo_filename, v_boxes, v_labels, v_scores)

def main():
    """
    Defined starting point of source code.

    Builds the YOLO v3 model, fills it from 'yolov3.weights', saves it as
    'yolov3.h5' and then runs the prediction step on the saved-in-memory model.
    """

    # Step 3:
    # (1) Define the model
    # (2) Load the weight
    # (3) Save the model

    # Define the YOLO v3 model
    yolov3 = make_yolov3_model()
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only added a stray "None" line.
    yolov3.summary()

    # Load the weights
    # Source: https://pjreddie.com/media/files/yolov3.weights
    weight_reader = WeightReader('yolov3.weights')

    # Set the weights
    weight_reader.load_weights(yolov3)

    # Save the model to file
    yolov3.save('yolov3.h5')

    # Step 8:
    # Make Prediction
    make_prediction(yolov3)

# Run the pipeline only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()