"""
YOLO v3 object detection with Keras

Source:
https://towardsdatascience.com/yolo-v3-object-detection-with-keras-461d2cfccef6
"""

import struct
import glob

import numpy as np
from numpy import expand_dims
from keras.layers import Input, Conv2D, BatchNormalization, LeakyReLU, ZeroPadding2D, UpSampling2D
from keras.models import Model
from keras.layers.merge import add, concatenate
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from matplotlib import pyplot
from matplotlib.patches import Rectangle


# Step 1:
# Define WeightReader class
class WeightReader:
    """
    Parse a "yolov3.weights" file and copy its values into a Keras model.

    The whole payload after the file header is kept in memory as a flat
    float32 array (``all_weights``); ``offset`` is the index of the next
    unread value in that array.
    """

    def __init__(self, weight_file):
        """
        Read the header of ``weight_file`` and load the remaining bytes.

        The header holds three 4-byte fields (major, minor, revision)
        followed by a counter that is 8 bytes wide when
        ``major * 10 + minor >= 2`` (and both are < 1000), else 4 bytes.
        Only major/minor are used; the rest is skipped.
        """
        with open(weight_file, 'rb') as w_f:
            major, = struct.unpack('i', w_f.read(4))
            minor, = struct.unpack('i', w_f.read(4))
            w_f.read(4)  # ignore revision
            if (major * 10 + minor) >= 2 and major < 1000 and minor < 1000:
                w_f.read(8)
            else:
                w_f.read(4)
            binary = w_f.read()
        self.offset = 0
        self.all_weights = np.frombuffer(binary, dtype='float32')

    def read_bytes(self, size):
        """
        Return the next ``size`` float32 values and advance the offset.

        Despite the name, ``size`` counts array elements, not bytes.

        Raises:
            ValueError: if fewer than ``size`` values remain, which means
                the weight file does not match the model being loaded.
        """
        end = self.offset + size
        if end > len(self.all_weights):
            raise ValueError(
                "Weight file exhausted: requested %d values at offset %d, "
                "but only %d are available"
                % (size, self.offset, len(self.all_weights)))
        start = self.offset
        self.offset = end
        return self.all_weights[start:end]

    def _read_kernel(self, keras_shape):
        """
        Read one convolution kernel and lay it out in ``keras_shape`` order.

        The values are reshaped to the reversed Keras shape and then
        transposed with axes [2, 3, 1, 0].
        """
        kernel = self.read_bytes(np.prod(keras_shape))
        kernel = kernel.reshape(list(reversed(keras_shape)))
        return kernel.transpose([2, 3, 1, 0])

    def load_weights(self, model):
        """
        Load weights into created model.

        Layers are looked up by name (``conv_<i>`` / ``bnorm_<i>``) for
        i in 0..105. Indices with no ``conv_<i>`` layer are reported and
        skipped. Errors raised while reading or assigning weights are no
        longer swallowed, so a mismatching weight file fails loudly.
        """
        for i in range(106):
            # Only the lookup may legitimately fail: not every index in
            # the range names a convolution in the model.
            try:
                conv_layer = model.get_layer('conv_' + str(i))
            except ValueError:
                print("No convolution #" + str(i))
                continue

            print("Loading weights of convolution #" + str(i))

            # The three output convolutions have no batch normalisation.
            if i not in [81, 93, 105]:
                norm_layer = model.get_layer('bnorm_' + str(i))
                size = np.prod(norm_layer.get_weights()[0].shape)
                # File order is beta, gamma, mean, variance ...
                beta = self.read_bytes(size)   # bias
                gamma = self.read_bytes(size)  # scale
                mean = self.read_bytes(size)   # mean
                var = self.read_bytes(size)    # variance
                # ... while set_weights is given gamma first.
                norm_layer.set_weights([gamma, beta, mean, var])

            conv_weights = conv_layer.get_weights()
            if len(conv_weights) > 1:
                # Layer has a bias term: it is stored before the kernel.
                bias = self.read_bytes(np.prod(conv_weights[1].shape))
                kernel = self._read_kernel(conv_weights[0].shape)
                conv_layer.set_weights([kernel, bias])
            else:
                kernel = self._read_kernel(conv_weights[0].shape)
                conv_layer.set_weights([kernel])

    def reset(self):
        """
        Resets offset to restart loading weights.
        """
        self.offset = 0


# Step 2
def _conv_block(input_layer, convs, skip=True):
    """
    Stack the convolutions described by ``convs`` on top of ``input_layer``.

    Each entry of ``convs`` is a dict with the keys 'filter', 'kernel',
    'stride', 'bnorm', 'leaky' and 'layer_idx'. When ``skip`` is true the
    tensor feeding the second-to-last convolution is added to the block
    output (residual connection).

    Raises:
        ValueError: if ``skip`` is true but ``convs`` has fewer than two
            entries, so there is no tensor to use as the skip connection.
    """
    if skip and len(convs) < 2:
        raise ValueError("skip=True requires at least two convolutions")

    tmp = input_layer
    skip_connection = None
    for count, conv in enumerate(convs):
        if skip and count == len(convs) - 2:
            skip_connection = tmp

        # Peculiar padding as darknet prefer left and top
        if conv['stride'] > 1:
            tmp = ZeroPadding2D(((1, 0), (1, 0)))(tmp)
        tmp = Conv2D(conv['filter'],
                     conv['kernel'],
                     strides=conv['stride'],
                     padding='valid' if conv['stride'] > 1 else 'same',
                     name='conv_' + str(conv['layer_idx']),
                     # Batch normalisation supplies its own offset term.
                     use_bias=not conv['bnorm'])(tmp)
        if conv['bnorm']:
            tmp = BatchNormalization(epsilon=0.001,
                                     name='bnorm_' + str(conv['layer_idx']))(tmp)
        if conv['leaky']:
            tmp = LeakyReLU(alpha=0.1,
                            name='leaky_' + str(conv['layer_idx']))(tmp)

    return add([skip_connection, tmp]) if skip else tmp


def _conv_spec(filters, kernel, stride, layer_idx, bnorm=True, leaky=True):
    """Build one layer description in the dict format used by _conv_block."""
    return {'filter': filters, 'kernel': kernel, 'stride': stride,
            'bnorm': bnorm, 'leaky': leaky, 'layer_idx': layer_idx}


def make_yolov3_model():
    """
    Build the YOLO v3 network as a Keras ``Model``.

    The model takes an image tensor of shape (None, None, 3) and returns
    three outputs (``yolo_82``, ``yolo_94``, ``yolo_106``), one per
    detection scale, each with 255 channels.
    """
    input_image = Input(shape=(None, None, 3))

    # Layer 0 => 4
    tmp = _conv_block(input_image, [_conv_spec(32, 3, 1, 0),
                                    _conv_spec(64, 3, 2, 1),
                                    _conv_spec(32, 1, 1, 2),
                                    _conv_spec(64, 3, 1, 3)])

    # Layer 5 => 8
    tmp = _conv_block(tmp, [_conv_spec(128, 3, 2, 5),
                            _conv_spec(64, 1, 1, 6),
                            _conv_spec(128, 3, 1, 7)])

    # Layer 9 => 11
    tmp = _conv_block(tmp, [_conv_spec(64, 1, 1, 9),
                            _conv_spec(128, 3, 1, 10)])

    # Layer 12 => 15
    tmp = _conv_block(tmp, [_conv_spec(256, 3, 2, 12),
                            _conv_spec(128, 1, 1, 13),
                            _conv_spec(256, 3, 1, 14)])

    # Layer 16 => 36
    for i in range(7):
        tmp = _conv_block(tmp, [_conv_spec(128, 1, 1, 16 + i * 3),
                                _conv_spec(256, 3, 1, 17 + i * 3)])
    skip_36 = tmp

    # Layer 37 => 40
    tmp = _conv_block(tmp, [_conv_spec(512, 3, 2, 37),
                            _conv_spec(256, 1, 1, 38),
                            _conv_spec(512, 3, 1, 39)])

    # Layer 41 => 61
    for i in range(7):
        tmp = _conv_block(tmp, [_conv_spec(256, 1, 1, 41 + i * 3),
                                _conv_spec(512, 3, 1, 42 + i * 3)])
    skip_61 = tmp

    # Layer 62 => 65
    tmp = _conv_block(tmp, [_conv_spec(1024, 3, 2, 62),
                            _conv_spec(512, 1, 1, 63),
                            _conv_spec(1024, 3, 1, 64)])

    # Layer 66 => 74
    for i in range(3):
        tmp = _conv_block(tmp, [_conv_spec(512, 1, 1, 66 + i * 3),
                                _conv_spec(1024, 3, 1, 67 + i * 3)])

    # Layer 75 => 79
    tmp = _conv_block(tmp, [_conv_spec(512, 1, 1, 75),
                            _conv_spec(1024, 3, 1, 76),
                            _conv_spec(512, 1, 1, 77),
                            _conv_spec(1024, 3, 1, 78),
                            _conv_spec(512, 1, 1, 79)], skip=False)

    # Layer 80 => 82
    yolo_82 = _conv_block(tmp, [_conv_spec(1024, 3, 1, 80),
                                _conv_spec(255, 1, 1, 81, bnorm=False, leaky=False)],
                          skip=False)

    # Layer 83 => 86
    tmp = _conv_block(tmp, [_conv_spec(256, 1, 1, 84)], skip=False)
    tmp = UpSampling2D(2)(tmp)
    tmp = concatenate([tmp, skip_61])

    # Layer 87 => 91
    tmp = _conv_block(tmp, [_conv_spec(256, 1, 1, 87),
                            _conv_spec(512, 3, 1, 88),
                            _conv_spec(256, 1, 1, 89),
                            _conv_spec(512, 3, 1, 90),
                            _conv_spec(256, 1, 1, 91)], skip=False)

    # Layer 92 => 94
    yolo_94 = _conv_block(tmp, [_conv_spec(512, 3, 1, 92),
                                _conv_spec(255, 1, 1, 93, bnorm=False, leaky=False)],
                          skip=False)

    # Layer 95 => 98
    tmp = _conv_block(tmp, [_conv_spec(128, 1, 1, 96)], skip=False)
    tmp = UpSampling2D(2)(tmp)
    tmp = concatenate([tmp, skip_36])

    # Layer 99 => 106
    yolo_106 = _conv_block(tmp, [_conv_spec(128, 1, 1, 99),
                                 _conv_spec(256, 3, 1, 100),
                                 _conv_spec(128, 1, 1, 101),
                                 _conv_spec(256, 3, 1, 102),
                                 _conv_spec(128, 1, 1, 103),
                                 _conv_spec(256, 3, 1, 104),
                                 _conv_spec(255, 1, 1, 105, bnorm=False, leaky=False)],
                           skip=False)

    model = Model(input_image, [yolo_82, yolo_94, yolo_106])
    return model


# Step 4:
# Prediction
def load_image_pixels(filename, shape):
    """
    Load an image file as a normalised, batched array for the model.

    Args:
        filename: path of the image to load.
        shape: target size handed to ``load_img(target_size=...)``, which
            Keras documents as ``(height, width)``.

    Returns:
        ``(image, width, height)`` where ``image`` is a float32 array with
        a leading batch dimension of 1 and values scaled to [0, 1], and
        ``width`` / ``height`` are the size of the original file.
    """
    # Load image to get its original size
    image = load_img(filename)
    width, height = image.size
    # Load image again, resized to the network input size
    image = load_img(filename, target_size=shape)
    image = img_to_array(image)
    # Scale pixel values from [0, 255] to [0, 1]
    image = image.astype('float32')
    image /= 255.0
    # Add a dimension so that we have one sample
    image = expand_dims(image, 0)
    return image, width, height


# Step 4: Decode the prediction output to rectangle coordinates
class BoundBox:
    """
    One detected box: corner coordinates, objectness and class scores.

    ``label`` and ``score`` are computed lazily from ``classes`` and cached;
    -1 marks "not computed yet". Instances are produced by
    ``decode_netout``.
    """

    def __init__(self, xmin, ymin, xmax, ymax, objness=None, classes=None):
        self.xmin = xmin
        self.ymin = ymin
        self.xmax = xmax
        self.ymax = ymax
        self.objness = objness
        self.classes = classes
        self.label = -1
        self.score = -1

    def get_label(self):
        """
        Return the index of the highest class score (cached after first call).
        """
        if self.label == -1:
            self.label = np.argmax(self.classes)
        return self.label

    def get_score(self):
        """
        Return the class score of the best label (cached after first call).
        """
        if self.score == -1:
            self.score = self.classes[self.get_label()]
        # Return the value itself; the original returned the bound method.
        return self.score


def _sigmoid(inp):
    """Element-wise logistic function 1 / (1 + exp(-inp))."""
    return 1. / (1. + np.exp(-inp))


def decode_netout(netout, anchors, obj_thresh, net_h, net_w):
    """
    Decode one network output map into a list of ``BoundBox`` objects.

    Args:
        netout: output for a single image at one scale, with the grid
            height and width as its first two axes.
        anchors: flat list ``[w0, h0, w1, h1, ...]`` of anchor sizes for
            this scale; its length fixes the number of boxes per cell.
        obj_thresh: boxes whose objectness is not above this value are
            dropped, and class scores not above it are zeroed.
        net_h, net_w: network input height and width, used to express the
            box size as a fraction of the input.

    Returns:
        A list of ``BoundBox`` with coordinates as fractions of the image.

    Note:
        The activations are written back into ``netout`` (in place whenever
        the reshape yields a view of the caller's array).
    """
    grid_h, grid_w = netout.shape[:2]
    nb_box = len(anchors) // 2
    netout = netout.reshape((grid_h, grid_w, nb_box, -1))
    boxes = []

    netout[..., :2] = _sigmoid(netout[..., :2])
    netout[..., 4:] = _sigmoid(netout[..., 4:])
    # Class score = objectness * class probability
    netout[..., 5:] = netout[..., 4][..., np.newaxis] * netout[..., 5:]
    netout[..., 5:] *= netout[..., 5:] > obj_thresh

    # Integer row/col indices: a fractional row (Python 3 "/" division)
    # would shift every box centre vertically.
    for row in range(grid_h):
        for col in range(grid_w):
            for j in range(nb_box):
                cell = netout[row, col, j]
                # 4th element is objectness score
                objectness = cell[4]
                if objectness <= obj_thresh:
                    continue

                # First 4 elements of the bounding box are x, y, w, and h
                box_x, box_y, box_w, box_h = cell[:4]
                box_x = (col + box_x) / grid_w  # Center position, unit: image width
                box_y = (row + box_y) / grid_h  # Center position, unit: image height
                box_w = anchors[2 * j + 0] * np.exp(box_w) / net_w  # Unit: image width
                box_h = anchors[2 * j + 1] * np.exp(box_h) / net_h  # Unit: image height

                # Last elements are class probabilities
                classes = cell[5:]
                boxes.append(BoundBox(box_x - box_w / 2, box_y - box_h / 2,
                                      box_x + box_w / 2, box_y + box_h / 2,
                                      objectness, classes))
    return boxes


# Step 5
def correct_yolo_boxes(boxes, image_h, image_w, net_h, net_w):
    """
    Stretch the boxes (in place) to pixel coordinates of the original image.

    The scaled area is taken to be the full network input
    (``new_w, new_h = net_w, net_h``), so the offsets evaluate to 0 and
    the scales to 1; the box fractions are simply multiplied by the image
    size and truncated to int.
    """
    new_w, new_h = net_w, net_h
    # These do not depend on the box, so compute them once.
    x_offset, x_scale = (net_w - new_w) / 2. / net_w, float(new_w) / net_w
    y_offset, y_scale = (net_h - new_h) / 2. / net_h, float(new_h) / net_h
    for box in boxes:
        box.xmin = int((box.xmin - x_offset) / x_scale * image_w)
        box.xmax = int((box.xmax - x_offset) / x_scale * image_w)
        box.ymin = int((box.ymin - y_offset) / y_scale * image_h)
        box.ymax = int((box.ymax - y_offset) / y_scale * image_h)


# Step 6
def _interval_overlap(interval_a, interval_b):
    """
    Return the length of the overlap of two 1-D intervals (0 if disjoint).

    Used as a building block of Intersection over Union (IoU).

    Source:
    https://www.pyimagesearch.com/2016/11/07/intersection-over-union-iou-for-object-detection/
    https://medium.com/@amrokamal_47691/yolo-yolov2-and-yolov3-all-you-want-to-know-7e3e92dc4899
    """
    x_1, x_2 = interval_a
    x_3, x_4 = interval_b
    return max(0, min(x_2, x_4) - max(x_1, x_3))


def bbox_iou(box1, box2):
    """
    Return the Intersection over Union of two boxes as a float.

    Returns 0.0 when the union has no area (e.g. both boxes collapsed to
    zero width or height), instead of dividing by zero.
    """
    intersect_w = _interval_overlap([box1.xmin, box1.xmax], [box2.xmin, box2.xmax])
    intersect_h = _interval_overlap([box1.ymin, box1.ymax], [box2.ymin, box2.ymax])
    intersect = intersect_w * intersect_h

    w_1, h_1 = box1.xmax - box1.xmin, box1.ymax - box1.ymin
    w_2, h_2 = box2.xmax - box2.xmin, box2.ymax - box2.ymin
    union = w_1 * h_1 + w_2 * h_2 - intersect
    if union <= 0:
        return 0.0
    return float(intersect) / union


def do_nms(boxes, nms_thresh):
    """
    Non-maximum suppression, applied per class and in place.

    For every class, boxes are visited from highest to lowest score; any
    lower-scored box whose IoU with the current box is at least
    ``nms_thresh`` has its score for that class set to 0. Nothing is
    returned and no box is removed from ``boxes``.
    """
    if not boxes:
        return
    nb_classes = len(boxes[0].classes)

    for nb_class in range(nb_classes):
        sorted_indices = np.argsort([-box.classes[nb_class] for box in boxes])
        for i, index_i in enumerate(sorted_indices):
            # Already suppressed (or never above threshold) for this class
            if boxes[index_i].classes[nb_class] == 0:
                continue
            for index_j in sorted_indices[i + 1:]:
                if bbox_iou(boxes[index_i], boxes[index_j]) >= nms_thresh:
                    boxes[index_j].classes[nb_class] = 0


def get_boxes(boxes, labels, thresh):
    """
    Get all of the results above a threshold.

    Returns three parallel lists ``(v_boxes, v_labels, v_scores)``; a box
    appears once per label whose score exceeds ``thresh``, and scores are
    expressed as percentages.
    """
    v_boxes, v_labels, v_scores = [], [], []
    # Enumerate all boxes
    for box in boxes:
        # Enumerate all possible labels
        for i, label in enumerate(labels):
            # Check if the score for this label is high enough
            if box.classes[i] > thresh:
                v_boxes.append(box)
                v_labels.append(label)
                v_scores.append(box.classes[i] * 100)
                # Don't break, many labels may trigger for one box
    return v_boxes, v_labels, v_scores


def draw_boxes(filename, v_boxes, v_labels, v_scores):
    """
    Show the image at ``filename`` with every box, label and score drawn.
    """
    # Load the image
    data = pyplot.imread(filename)
    # Plot the image
    pyplot.imshow(data)
    # Get the context for drawing boxes
    axes = pyplot.gca()
    # Plot each box
    for box, v_label, v_score in zip(v_boxes, v_labels, v_scores):
        # Get coordinates
        y_1, x_1, y_2, x_2 = box.ymin, box.xmin, box.ymax, box.xmax
        # Calculate width and height of the box
        width, height = x_2 - x_1, y_2 - y_1
        # Create the shape
        rect = Rectangle((x_1, y_1), width, height,
                         fill=False, color='red', linewidth=2)
        # Draw the box
        axes.add_patch(rect)
        # Draw text and score in top left corner
        label = "%s (%.3f)" % (v_label, v_score)
        pyplot.text(x_1, y_1, label, color='white', backgroundcolor='red')
    # Show the plot
    pyplot.show()


# Step 7:
# Declare several configurations
# Define the anchors (one flat [w, h, w, h, w, h] list per model output)
ANCHORS = [[116, 90, 156, 198, 373, 326],
           [30, 61, 62, 45, 59, 119],
           [10, 13, 16, 30, 33, 23]]

# Define the probability threshold for detected objects
CLASS_THRESHOLD = 0.6

# Define the labels
LABELS = [
    "person", "bicycle", "car", "motorbike", "aeroplane",                      # 0-4
    "bus", "train", "truck", "boat", "traffic light",                          # 5-9
    "fire hydrant", "stop sign", "parking meter", "bench", "bird",             # 10-14
    "cat", "dog", "horse", "sheep", "cow",                                     # 15-19
    "elephant", "bear", "zebra", "giraffe", "backpack",                        # 20-24
    "umbrella", "handbag", "tie", "suitcase", "frisbee",                       # 25-29
    "skis", "snowboard", "sports ball", "kite", "baseball bat",                # 30-34
    "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle",    # 35-39
    "wine glass", "cup", "fork", "knife", "spoon",                             # 40-44
    "bowl", "banana", "apple", "sandwich", "orange",                           # 45-49
    "broccoli", "carrot", "hot dog", "pizza", "donut",                         # 50-54
    "cake", "chair", "sofa", "pottedplant", "bed",                             # 55-59
    "diningtable", "toilet", "tvmonitor", "laptop", "mouse",                   # 60-64
    "remote", "keyboard", "cell phone", "microwave", "oven",                   # 65-69
    "toaster", "sink", "refrigerator", "book", "clock",                        # 70-74
    "vase", "scissors", "teddy bear", "hair drier", "toothbrush",              # 75-79
]


def make_prediction(model, pattern="images/test/motorbike/images2.jpg"):
    """
    Execute predictions with YOLO v3.

    Args:
        model: model whose ``predict`` returns one array per entry of
            ``ANCHORS``.
        pattern: glob pattern selecting the image files to process;
            defaults to the single test image used originally.

    For every matching file the detections are printed and then drawn.
    """
    for photo_filename in glob.glob(pattern):
        # Define the expected input shape for the model
        input_w, input_h = 416, 416
        # target_size is (height, width)
        image, image_w, image_h = load_image_pixels(photo_filename,
                                                    (input_h, input_w))
        # Make prediction
        netouts = model.predict(image)
        # Summarize the shape of the list of arrays
        print([a.shape for a in netouts])

        boxes = []
        for i, netout in enumerate(netouts):
            # Decode the output of the network
            boxes += decode_netout(netout[0], ANCHORS[i], CLASS_THRESHOLD,
                                   input_h, input_w)

        # Correct the sizes of the bounding boxes for the shape of the image
        correct_yolo_boxes(boxes, image_h, image_w, input_h, input_w)
        # Suppress non-maximal boxes
        do_nms(boxes, 0.5)
        # Get the details of the detected objects
        v_boxes, v_labels, v_scores = get_boxes(boxes, LABELS, CLASS_THRESHOLD)
        # Summarize what we found
        for v_label, v_score in zip(v_labels, v_scores):
            print(v_label, v_score)
        # Draw what we found
        draw_boxes(photo_filename, v_boxes, v_labels, v_scores)


def main():
    """
    Defined starting point of source code.

    Builds the model, loads 'yolov3.weights' into it, saves it as
    'yolov3' and runs the prediction demo.
    """
    # Step 3:
    # (1) Define the model
    # (2) Load the weight
    # (3) Save the model

    # Define the YOLO v3 model
    yolov3 = make_yolov3_model()
    # summary() prints by itself; wrapping it in print() adds a stray "None"
    yolov3.summary()

    # Load the weights
    # Source: https://pjreddie.com/media/files/yolov3.weights
    weight_reader = WeightReader('yolov3.weights')

    # Set the weights
    weight_reader.load_weights(yolov3)

    # Save the model to file
    # yolov3.trainable = False
    yolov3.save('yolov3')

    # Step 8:
    # Make Prediction
    make_prediction(yolov3)


if __name__ == "__main__":
    main()