Files
simple-keras/yolov3.py
T
2020-11-06 13:46:11 +01:00

619 lines
22 KiB
Python

"""
YOLO v3 object detection with Keras
Source: https://towardsdatascience.com/yolo-v3-object-detection-with-keras-461d2cfccef6
"""
import struct
import glob
import numpy as np
from numpy import expand_dims
from keras.layers import Input, Conv2D, BatchNormalization, LeakyReLU, ZeroPadding2D, UpSampling2D
from keras.models import Model
from keras.layers.merge import add, concatenate
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from matplotlib import pyplot
from matplotlib.patches import Rectangle
# Step 1:
# Define WeightReader class
class WeightReader:
    """
    WeightReader parses the "yolov3.weights" file and loads the model weights into
    memory in a format that can be set into a Keras model.

    After a short integer header, the remainder of the file is interpreted as one
    flat float32 array that is consumed front to back through `read_bytes`.
    """

    def __init__(self, weight_file):
        """
        Read the header and the complete weight payload of `weight_file`.
        """
        with open(weight_file, 'rb') as w_f:
            major, = struct.unpack('i', w_f.read(4))
            minor, = struct.unpack('i', w_f.read(4))
            w_f.read(4)  # ignore revision
            # The last header field is skipped as well; it is 8 bytes wide for
            # versions >= 0.2 and 4 bytes wide otherwise.
            if (major * 10 + minor) >= 2 and major < 1000 and minor < 1000:
                w_f.read(8)
            else:
                w_f.read(4)
            binary = w_f.read()
        self.offset = 0
        self.all_weights = np.frombuffer(binary, dtype='float32')

    def read_bytes(self, size):
        """
        Return the next `size` values of `all_weights` and advance the offset.

        Despite the name, `size` counts float32 values, not bytes.

        Raises:
            ValueError: if fewer than `size` values are left. The offset is not
                advanced in that case.
        """
        size = int(size)
        end = self.offset + size
        if end > len(self.all_weights):
            raise ValueError(
                "Weights file is too short: needed %d more values at offset %d, "
                "but only %d are left" % (size, self.offset,
                                          len(self.all_weights) - self.offset))
        chunk = self.all_weights[self.offset:end]
        self.offset = end
        return chunk

    def load_weights(self, model):
        """
        Load weights into created model.

        Walks the layer indices 0..105 and fills every layer named 'conv_<i>'
        (plus its 'bnorm_<i>' companion) from the weight array, in file order.
        Indices without a 'conv_<i>' layer are reported and skipped.

        Raises:
            ValueError: if a batch-normalisation layer is missing or the file does
                not hold enough values for a layer. Only the lookup of 'conv_<i>'
                is treated as "layer does not exist"; any other error propagates
                instead of being reported as a missing convolution.
        """
        for i in range(106):
            # Keep the try body minimal: only a missing layer means "skip"
            try:
                conv_layer = model.get_layer('conv_' + str(i))
            except ValueError:
                print("No convolution #" + str(i))
                continue
            print("Loading weights of convolution #" + str(i))
            # Layers 81, 93 and 105 are the only convolutions without batch normalisation
            if i not in [81, 93, 105]:
                norm_layer = model.get_layer('bnorm_' + str(i))
                size = np.prod(norm_layer.get_weights()[0].shape)
                # Read order follows the file; set_weights below reorders the values
                beta = self.read_bytes(size)  # bias
                gamma = self.read_bytes(size)  # scale
                mean = self.read_bytes(size)  # mean
                var = self.read_bytes(size)  # variance
                norm_layer.set_weights([gamma, beta, mean, var])
            # get_weights() copies the arrays, so fetch them once per layer
            conv_weights = conv_layer.get_weights()
            kernel_shape = conv_weights[0].shape
            has_bias = len(conv_weights) > 1
            if has_bias:
                # The bias precedes the kernel in the file
                bias = self.read_bytes(np.prod(conv_weights[1].shape))
            kernel = self.read_bytes(np.prod(kernel_shape))
            # The file stores the kernel with its axes in reverse order of the Keras
            # kernel; reshape to that layout, then move the axes into place.
            kernel = kernel.reshape(list(reversed(kernel_shape)))
            kernel = kernel.transpose([2, 3, 1, 0])
            if has_bias:
                conv_layer.set_weights([kernel, bias])
            else:
                conv_layer.set_weights([kernel])

    def reset(self):
        """
        Resets offset to restart loading weights.
        """
        self.offset = 0
# Step 2
def _conv_block(input_layer, convs, skip=True):
"""
Function to create convolutional layer.
"""
tmp = input_layer
count = 0
for conv in convs:
if count == (len(convs) - 2) and skip:
skip_connection = tmp
count += 1
# Peculiar padding as darknet prefer left and top
if conv['stride'] > 1:
tmp = ZeroPadding2D(((1,0),(1,0)))(tmp)
tmp = Conv2D(conv['filter'],
conv['kernel'],
strides=conv['stride'],
# Peculiar padding as darknet prefer left and top
padding='valid' if conv['stride'] > 1 else 'same',
name='conv_' + str(conv['layer_idx']),
use_bias=False if conv['bnorm'] else True)(tmp)
if conv['bnorm']:
tmp = BatchNormalization(epsilon=0.001, name='bnorm_' + str(conv['layer_idx']))(tmp)
if conv['leaky']:
tmp = LeakyReLU(alpha=0.1, name='leaky_' + str(conv['layer_idx']))(tmp)
return add([skip_connection, tmp]) if skip else tmp
def make_yolov3_model():
    """
    Build the full YOLO v3 network by stacking convolutional blocks.

    Returns a Keras Model that maps an image tensor with 3 channels and free
    spatial size to the three detection outputs (layers 81, 93 and 105).
    """
    def layer(filters, kernel, stride, idx, activated=True):
        # One convolution description in the dictionary format read by _conv_block
        return {'filter': filters, 'kernel': kernel, 'stride': stride,
                'bnorm': activated, 'leaky': activated, 'layer_idx': idx}

    def bottleneck(narrow, first_idx):
        # 1x1 convolution followed by a 3x3 convolution with twice as many filters
        return [layer(narrow, 1, 1, first_idx),
                layer(narrow * 2, 3, 1, first_idx + 1)]

    def downsample(wide, first_idx):
        # Stride-2 3x3 convolution followed by a bottleneck back to `wide` filters
        return [layer(wide, 3, 2, first_idx)] + bottleneck(wide // 2, first_idx + 1)

    input_image = Input(shape=(None, None, 3))

    # Layer 0 => 4
    tmp = _conv_block(input_image, [layer(32, 3, 1, 0)] + downsample(64, 1))
    # Layer 5 => 8
    tmp = _conv_block(tmp, downsample(128, 5))
    # Layer 9 => 11
    tmp = _conv_block(tmp, bottleneck(64, 9))
    # Layer 12 => 15
    tmp = _conv_block(tmp, downsample(256, 12))
    # Layer 16 => 36
    for i in range(7):
        tmp = _conv_block(tmp, bottleneck(128, 16 + i * 3))
    skip_36 = tmp
    # Layer 37 => 40
    tmp = _conv_block(tmp, downsample(512, 37))
    # Layer 41 => 61
    for i in range(7):
        tmp = _conv_block(tmp, bottleneck(256, 41 + i * 3))
    skip_61 = tmp
    # Layer 62 => 65
    tmp = _conv_block(tmp, downsample(1024, 62))
    # Layer 66 => 74
    for i in range(3):
        tmp = _conv_block(tmp, bottleneck(512, 66 + i * 3))
    # Layer 75 => 79
    tmp = _conv_block(tmp,
                      bottleneck(512, 75) + bottleneck(512, 77) + [layer(512, 1, 1, 79)],
                      skip=False)
    # Layer 80 => 82: first detection output
    yolo_82 = _conv_block(tmp,
                          [layer(1024, 3, 1, 80),
                           layer(255, 1, 1, 81, activated=False)],
                          skip=False)
    # Layer 83 => 86: upsample and merge with the output saved after layer 61
    tmp = _conv_block(tmp, [layer(256, 1, 1, 84)], skip=False)
    tmp = UpSampling2D(2)(tmp)
    tmp = concatenate([tmp, skip_61])
    # Layer 87 => 91
    tmp = _conv_block(tmp,
                      bottleneck(256, 87) + bottleneck(256, 89) + [layer(256, 1, 1, 91)],
                      skip=False)
    # Layer 92 => 94: second detection output
    yolo_94 = _conv_block(tmp,
                          [layer(512, 3, 1, 92),
                           layer(255, 1, 1, 93, activated=False)],
                          skip=False)
    # Layer 95 => 98: upsample and merge with the output saved after layer 36
    tmp = _conv_block(tmp, [layer(128, 1, 1, 96)], skip=False)
    tmp = UpSampling2D(2)(tmp)
    tmp = concatenate([tmp, skip_36])
    # Layer 99 => 106: third detection output
    yolo_106 = _conv_block(tmp,
                           bottleneck(128, 99) + bottleneck(128, 101) + bottleneck(128, 103)
                           + [layer(255, 1, 1, 105, activated=False)],
                           skip=False)
    return Model(input_image, [yolo_82, yolo_94, yolo_106])
# Step 4:
# Prediction
def load_image_pixels(filename, shape):
    """
    Load an image file as a scaled, single-sample batch for the network.

    Returns a tuple (pixels, width, height): `pixels` is the image resized to
    `shape`, converted to float32, divided by 255 and given a leading batch
    axis; `width` and `height` are the dimensions of the unresized image.
    """
    # First pass: only the original dimensions are needed
    original = load_img(filename)
    width, height = original.size
    # Second pass: let the loader resize to the requested shape
    resized = load_img(filename, target_size=shape)
    pixels = img_to_array(resized).astype('float32')
    # Scale pixel values into [0, 1]
    pixels /= 255.0
    # Add a leading axis so that the array holds exactly one sample
    batch = expand_dims(pixels, 0)
    return batch, width, height
# Step 4: Decode the prediction output to rectangle coordinates
class BoundBox:
    """
    BoundBox holds the corner coordinates of one detected box together with its
    objectness score and per-class scores.

    The label (index of the highest class score) and the score (value of that
    class score) are computed on first request and cached; -1 marks "not yet
    computed".
    """

    def __init__(self, xmin, ymin, xmax, ymax, objness = None, classes = None):
        self.xmin = xmin
        self.ymin = ymin
        self.xmax = xmax
        self.ymax = ymax
        self.objness = objness
        self.classes = classes
        self.label = -1
        self.score = -1

    def get_label(self):
        """
        Gets the label of the current object: the index of the highest class score.
        """
        if self.label == -1:
            self.label = np.argmax(self.classes)
        return self.label

    def get_score(self):
        """
        Gets the score of the current object: the class score at `get_label()`.
        """
        if self.score == -1:
            self.score = self.classes[self.get_label()]
        # Return the cached value, not the bound method
        return self.score
def _sigmoid(inp):
return 1. / (1. + np.exp(-inp))
def decode_netout(netout, anchors, obj_thresh, net_h, net_w):
    """
    Decode one raw output array of the network into a list of BoundBox objects.

    Args:
        netout: array whose first two axes are the grid rows and columns. It is
            reshaped to 3 boxes per cell, each laid out as x, y, w, h,
            objectness, then the class scores. The values are transformed in
            place, so the caller's array changes whenever the reshape is a view.
        anchors: flat sequence of anchor sizes, read as (width, height) pairs,
            one pair per box of a cell.
        obj_thresh: boxes whose objectness does not exceed this value are
            dropped, and class scores that do not exceed it are set to 0.
        net_h: height of the network input, used to scale the box height.
        net_w: width of the network input, used to scale the box width.

    Returns:
        A list of BoundBox whose coordinates are fractions of the image width
        and height. Each box's `classes` is a slice of `netout`, not a copy.
    """
    grid_h, grid_w = netout.shape[:2]
    nb_box = 3
    netout = netout.reshape((grid_h, grid_w, nb_box, -1))
    boxes = []
    # Squash the centre offsets, the objectness and the class scores into (0, 1)
    netout[..., :2] = _sigmoid(netout[..., :2])
    netout[..., 4:] = _sigmoid(netout[..., 4:])
    # Class confidence = objectness * class probability; zero out the weak ones
    netout[..., 5:] = netout[..., 4][..., np.newaxis] * netout[..., 5:]
    netout[..., 5:] *= netout[..., 5:] > obj_thresh
    # Integer row/column indices: a fractional row must never leak into box_y
    for row in range(grid_h):
        for col in range(grid_w):
            for j in range(nb_box):
                cell = netout[row, col, j]
                # 4th element is objectness score
                objectness = cell[4]
                if objectness <= obj_thresh:
                    continue
                # First 4 elements for the bounding box are x, y, w, and h
                box_x, box_y, box_w, box_h = cell[:4]
                box_x = (col + box_x) / grid_w  # Center position, unit: image width
                box_y = (row + box_y) / grid_h  # Center position, unit: image height
                box_w = anchors[2 * j + 0] * np.exp(box_w) / net_w  # Unit: image width
                box_h = anchors[2 * j + 1] * np.exp(box_h) / net_h  # Unit: image height
                # Last elements are class probabilities
                classes = cell[5:]
                box = BoundBox(box_x - box_w / 2,
                               box_y - box_h / 2,
                               box_x + box_w / 2,
                               box_y + box_h / 2,
                               objectness, classes)
                boxes.append(box)
    return boxes
# Step 5
def correct_yolo_boxes(boxes, image_h, image_w, net_h, net_w):
    """
    Convert the box coordinates in place to integer pixel positions of the image.

    The image is treated as filling the whole network input, so the offsets
    are 0 and the scales are 1; each coordinate is multiplied by the image
    width or height and truncated with int().
    """
    new_w, new_h = net_w, net_h
    for box in boxes:
        x_scale = float(new_w) / net_w
        x_offset = (net_w - new_w) / 2. / net_w
        y_scale = float(new_h) / net_h
        y_offset = (net_h - new_h) / 2. / net_h
        # (attribute, offset, scale, image extent) for each of the four edges
        edges = (('xmin', x_offset, x_scale, image_w),
                 ('xmax', x_offset, x_scale, image_w),
                 ('ymin', y_offset, y_scale, image_h),
                 ('ymax', y_offset, y_scale, image_h))
        for attribute, offset, scale, extent in edges:
            relative = getattr(box, attribute)
            setattr(box, attribute, int((relative - offset) / scale * extent))
# Step 6
def _interval_overlap(interval_a, interval_b):
"""
Implementing Intersection over Unit (IoU)
Source: https://www.pyimagesearch.com/2016/11/07/intersection-over-union-iou-for-object-detection/
https://medium.com/@amrokamal_47691/yolo-yolov2-and-yolov3-all-you-want-to-know-7e3e92dc4899
"""
x_1, x_2 = interval_a
x_3, x_4 = interval_b
if x_3 < x_1:
if x_4 < x_1:
ret = 0
else:
ret = min(x_2, x_4) - x_1
else:
if x_2 < x_3:
ret = 0
else:
ret = min(x_2, x_4) - x_3
return ret
def bbox_iou(box1, box2):
    """
    Return the Intersection over Union (IoU) of two boxes.

    Both arguments need the attributes xmin, xmax, ymin and ymax. The result is
    the intersection area divided by the union area. When the union is 0 (both
    boxes have zero area, which can happen once coordinates are truncated to
    integers), 0.0 is returned instead of dividing by zero.
    """
    intersect_w = _interval_overlap([box1.xmin, box1.xmax], [box2.xmin, box2.xmax])
    intersect_h = _interval_overlap([box1.ymin, box1.ymax], [box2.ymin, box2.ymax])
    intersect = intersect_w * intersect_h
    w_1, h_1 = box1.xmax - box1.xmin, box1.ymax - box1.ymin
    w_2, h_2 = box2.xmax - box2.xmin, box2.ymax - box2.ymin
    union = w_1 * h_1 + w_2 * h_2 - intersect
    if union == 0:
        # Two degenerate boxes share no area; treat them as non-overlapping
        return 0.0
    return float(intersect) / union
def do_nms(boxes, nms_thresh):
    """
    Suppress non-maximal boxes in place, one class at a time.

    For every class the boxes are visited in order of descending class score.
    Each visited box with a non-zero score sets that class score to 0 on every
    lower-ranked box whose IoU with it is at least `nms_thresh`. Nothing is
    returned; only the `classes` entries of the boxes change.
    """
    if len(boxes) == 0:
        return
    class_count = len(boxes[0].classes)
    for class_idx in range(class_count):
        # Negated scores make argsort produce a descending ranking
        ranking = np.argsort([-candidate.classes[class_idx] for candidate in boxes])
        for rank, keeper_idx in enumerate(ranking):
            keeper = boxes[keeper_idx]
            if keeper.classes[class_idx] == 0:
                # Already suppressed (or never scored) for this class
                continue
            for other_idx in ranking[rank + 1:]:
                rival = boxes[other_idx]
                if bbox_iou(keeper, rival) >= nms_thresh:
                    rival.classes[class_idx] = 0
def get_boxes(boxes, labels, thresh):
    """
    Collect every (box, label) pair whose class score exceeds `thresh`.

    Returns three parallel lists: the boxes, the matching label names and the
    scores as percentages. A box appears once for each label that passes.
    """
    kept_boxes, kept_labels, kept_scores = [], [], []
    for candidate in boxes:
        for position, name in enumerate(labels):
            confidence = candidate.classes[position]
            # No early exit: several labels may trigger for the same box
            if confidence > thresh:
                kept_boxes.append(candidate)
                kept_labels.append(name)
                kept_scores.append(confidence * 100)
    return kept_boxes, kept_labels, kept_scores
def draw_boxes(filename, v_boxes, v_labels, v_scores):
    """
    Show the image with every detected box outlined and captioned.

    `v_boxes`, `v_labels` and `v_scores` are parallel sequences; each box is
    drawn as a red rectangle with "<label> (<score>)" at its top left corner.
    """
    # Put the image on the current axes
    picture = pyplot.imread(filename)
    pyplot.imshow(picture)
    canvas = pyplot.gca()
    for position, box in enumerate(v_boxes):
        left, top = box.xmin, box.ymin
        # Rectangle takes the anchor corner plus width and height
        outline = Rectangle((left, top), box.xmax - left, box.ymax - top,
                            fill=False, color='red', linewidth = '2')
        canvas.add_patch(outline)
        caption = "%s (%.3f)" % (v_labels[position], v_scores[position])
        pyplot.text(left, top, caption, color='white', backgroundcolor='red')
    # Display the finished figure
    pyplot.show()
# Step 7:
# Declare several configuration values
# Define the anchors
# One list per network output, in output order: make_prediction passes ANCHORS[i]
# to decode_netout for the i-th output, which reads the list as (width, height) pairs.
ANCHORS = [[116,90, 156,198, 373,326], [30,61, 62,45, 59,119], [10,13, 16,30, 33,23]]
# Define the probability threshold for detected objects
# Used both as the objectness threshold in decode_netout and as the class score
# threshold in get_boxes.
CLASS_THRESHOLD = 0.6
# Define the labels
# get_boxes pairs LABELS[i] with box.classes[i], so the order of the names is
# significant. The trailing "# N" comments mark every fifth index.
LABELS = ["person", # 0
"bicycle",
"car",
"motorbike",
"aeroplane",
"bus", # 5
"train",
"truck",
"boat",
"traffic light",
"fire hydrant", # 10
"stop sign",
"parking meter",
"bench",
"bird",
"cat", # 15
"dog",
"horse",
"sheep",
"cow",
"elephant", # 20
"bear",
"zebra",
"giraffe",
"backpack",
"umbrella", # 25
"handbag",
"tie",
"suitcase",
"frisbee",
"skis", # 30
"snowboard",
"sports ball",
"kite",
"baseball bat",
"baseball glove", # 35
"skateboard",
"surfboard",
"tennis racket",
"bottle",
"wine glass", # 40
"cup",
"fork",
"knife",
"spoon",
"bowl", # 45
"banana",
"apple",
"sandwich",
"orange",
"broccoli", # 50
"carrot",
"hot dog",
"pizza",
"donut",
"cake", # 55
"chair",
"sofa",
"pottedplant",
"bed",
"diningtable", # 60
"toilet",
"tvmonitor",
"laptop",
"mouse",
"remote", # 65
"keyboard",
"cell phone",
"microwave",
"oven",
"toaster", # 70
"sink",
"refrigerator",
"book",
"clock",
"vase", # 75
"scissors",
"teddy bear",
"hair drier",
"toothbrush"]
def make_prediction(model, image_glob="images/test/motorbike/images2.jpg", input_size=(416, 416)):
    """
    Execute predictions with YOLO v3 and display the detections.

    Args:
        model: object whose `predict` returns one output array per entry of ANCHORS.
        image_glob: glob pattern selecting the image files to process. The
            default is the single test image that used to be hard-coded.
        input_size: (width, height) the images are resized to before prediction.

    For every matching file the detections are printed and then drawn with
    `draw_boxes`. A pattern that matches nothing is reported instead of being
    silently ignored.
    """
    # Define the expected input shape for the model
    input_w, input_h = input_size
    photo_filenames = glob.glob(image_glob)
    if not photo_filenames:
        print("No images found for pattern: " + str(image_glob))
    for photo_filename in photo_filenames:
        # Keras' load_img expects target_size as (height, width)
        image, image_w, image_h = load_image_pixels(photo_filename, (input_h, input_w))
        # Make prediction
        netouts = model.predict(image)
        # Summarize the shape of the list of arrays
        print([a.shape for a in netouts])
        boxes = []
        for i, netout in enumerate(netouts):
            # Decode the output of the network; netout[0] is the only sample of the batch
            boxes += decode_netout(netout[0], ANCHORS[i], CLASS_THRESHOLD, input_h, input_w)
        # Correct the sizes of the bounding boxes for the shape of the image
        correct_yolo_boxes(boxes, image_h, image_w, input_h, input_w)
        # Suppress non-maximal boxes
        do_nms(boxes, 0.5)
        # Get the details of the detected objects
        v_boxes, v_labels, v_scores = get_boxes(boxes, LABELS, CLASS_THRESHOLD)
        # Summarize what we found
        for label, score in zip(v_labels, v_scores):
            print(label, score)
        # Draw what we found
        draw_boxes(photo_filename, v_boxes, v_labels, v_scores)
def main():
    """
    Entry point: build the model, load the weights, save the model and predict.
    """
    # Step 3:
    # (1) Define the model
    # (2) Load the weight
    # (3) Save the model
    # Define the YOLO v3 model
    yolov3 = make_yolov3_model()
    # summary() prints the table itself and returns None, so it is not wrapped
    # in print() (that would emit an extra "None" line)
    yolov3.summary()
    # Load the weights
    # Source: https://pjreddie.com/media/files/yolov3.weights
    weight_reader = WeightReader('yolov3.weights')
    # Set the weights
    weight_reader.load_weights(yolov3)
    # Save the model to file
    yolov3.save('yolov3')
    # Step 8:
    # Make Prediction
    make_prediction(yolov3)


if __name__ == "__main__":
    main()