""" YOLO v3 object detection with Keras Source: https://towardsdatascience.com/yolo-v3-object-detection-with-keras-461d2cfccef6 """ import struct import glob import numpy as np from numpy import expand_dims from keras.layers import Input, Conv2D, BatchNormalization, LeakyReLU, ZeroPadding2D, UpSampling2D from keras.models import Model from keras.layers.merge import add, concatenate from keras.preprocessing.image import load_img from keras.preprocessing.image import img_to_array from matplotlib import pyplot from matplotlib.patches import Rectangle # Step 1: # Define WeightReader class class WeightReader: """ WeightReader class is used to parse the "yolov3.weights" file and load the model weights into memory in a format that we can set into keras model. """ def __init__(self, weight_file): with open(weight_file, 'rb') as w_f: major, = struct.unpack('i', w_f.read(4)) minor, = struct.unpack('i', w_f.read(4)) w_f.read(4) # ignore revision if (major * 10 + minor) >= 2 and major < 1000 and minor < 1000: w_f.read(8) else: w_f.read(4) binary = w_f.read() self.offset = 0 self.all_weights = np.frombuffer(binary, dtype='float32') def read_bytes(self, size): """ Helper function to read bytes from all_weights. """ self.offset = self.offset + size return self.all_weights[self.offset - size:self.offset] def load_weights(self, model): """ Load weights into created model. """ for i in range(106): try: conv_layer = model.get_layer('conv_' + str(i)) print("loading weights of convolution #" + str(i)) if i not in [81, 93, 105]: norm_layer = model.get_layer('bnorm_' + str(i)) size = np.prod(norm_layer.get_weights()[0].shape) beta = self.read_bytes(size) # bias gamma = self.read_bytes(size) # scale mean = self.read_bytes(size) # mean var = self.read_bytes(size) # variance norm_layer.set_weights([gamma, beta, mean, var]) if len(conv_layer.get_weights()) > 1: bias = self.read_bytes(np.prod(conv_layer.get_weights()[1].shape)) kernel = self.read_bytes(np.prod(conv_layer.get_weights()[0].shape)) kernel = kernel.reshape(list(reversed(conv_layer.get_weights()[0].shape))) kernel = kernel.transpose([2,3,1,0]) conv_layer.set_weights([kernel, bias]) else: kernel = self.read_bytes(np.prod(conv_layer.get_weights()[0].shape)) kernel = kernel.reshape(list(reversed(conv_layer.get_weights()[0].shape))) kernel = kernel.transpose([2,3,1,0]) conv_layer.set_weights([kernel]) except ValueError: print("no convolution #" + str(i)) def reset(self): """ Resets offset to restart loading weights. """ self.offset = 0 # Step 2: def _conv_block(input_layer, convs, skip=True): """ Function to create convolutional layer. """ tmp = input_layer count = 0 for conv in convs: if count == (len(convs) - 2) and skip: skip_connection = tmp count += 1 # Peculiar padding as darknet prefer left and top if conv['stride'] > 1: tmp = ZeroPadding2D(((1,0),(1,0)))(tmp) tmp = Conv2D(conv['filter'], conv['kernel'], strides=conv['stride'], # Peculiar padding as darknet prefer left and top padding='valid' if conv['stride'] > 1 else 'same', name='conv_' + str(conv['layer_idx']), use_bias=False if conv['bnorm'] else True)(tmp) if conv['bnorm']: tmp = BatchNormalization(epsilon=0.001, name='bnorm_' + str(conv['layer_idx']))(tmp) if conv['leaky']: tmp = LeakyReLU(alpha=0.1, name='leaky_' + str(conv['layer_idx']))(tmp) return add([skip_connection, tmp]) if skip else tmp def make_yolov3_model(): """ Function to create layers of convoluational and stack together as a whole yolo model. """ input_image = Input(shape=(None, None, 3)) # Layer 0 => 4 tmp = _conv_block(input_image, [{'filter': 32, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 0}, {'filter': 64, 'kernel': 3, 'stride': 2, 'bnorm': True, 'leaky': True, 'layer_idx': 1}, {'filter': 32, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 2}, {'filter': 64, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 3}]) # Layer 5 => 8 tmp = _conv_block(tmp, [{'filter': 128, 'kernel': 3, 'stride': 2, 'bnorm': True, 'leaky': True, 'layer_idx': 5}, {'filter': 64, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 6}, {'filter': 128, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 7}]) # Layer 9 => 11 tmp = _conv_block(tmp, [{'filter': 64, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 9}, {'filter': 128, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 10}]) # Layer 12 => 15 tmp = _conv_block(tmp, [{'filter': 256, 'kernel': 3, 'stride': 2, 'bnorm': True, 'leaky': True, 'layer_idx': 12}, {'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 13}, {'filter': 256, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 14}]) # Layer 16 => 36 for i in range(7): tmp = _conv_block(tmp, [{'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 16+i*3}, {'filter': 256, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 17+i*3}]) skip_36 = tmp # Layer 37 => 40 tmp = _conv_block(tmp, [{'filter': 512, 'kernel': 3, 'stride': 2, 'bnorm': True, 'leaky': True, 'layer_idx': 37}, {'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 38}, {'filter': 512, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 39}]) # Layer 41 => 61 for i in range(7): tmp = _conv_block(tmp, [{'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 41+i*3}, {'filter': 512, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 42+i*3}]) skip_61 = tmp # Layer 62 => 65 tmp = _conv_block(tmp, [{'filter': 1024, 'kernel': 3, 'stride': 2, 'bnorm': True, 'leaky': True, 'layer_idx': 62}, {'filter': 512, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 63}, {'filter': 1024, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 64}]) # Layer 66 => 74 for i in range(3): tmp = _conv_block(tmp, [{'filter': 512, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 66+i*3}, {'filter': 1024, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 67+i*3}]) # Layer 75 => 79 tmp = _conv_block(tmp, [{'filter': 512, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 75}, {'filter': 1024, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 76}, {'filter': 512, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 77}, {'filter': 1024, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 78}, {'filter': 512, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 79}], skip=False) # Layer 80 => 82 yolo_82 = _conv_block(tmp, [{'filter': 1024, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 80}, {'filter': 255, 'kernel': 1, 'stride': 1, 'bnorm': False, 'leaky': False, 'layer_idx': 81}], skip=False) # Layer 83 => 86 tmp = _conv_block(tmp, [{'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 84}], skip=False) tmp = UpSampling2D(2)(tmp) tmp = concatenate([tmp, skip_61]) # Layer 87 => 91 tmp = _conv_block(tmp, [{'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 87}, {'filter': 512, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 88}, {'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 89}, {'filter': 512, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 90}, {'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 91}], skip=False) # Layer 92 => 94 yolo_94 = _conv_block(tmp, [{'filter': 512, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 92}, {'filter': 255, 'kernel': 1, 'stride': 1, 'bnorm': False, 'leaky': False, 'layer_idx': 93}], skip=False) # Layer 95 => 98 tmp = _conv_block(tmp, [{'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 96}], skip=False) tmp = UpSampling2D(2)(tmp) tmp = concatenate([tmp, skip_36]) # Layer 99 => 106 yolo_106 = _conv_block(tmp, [{'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 99}, {'filter': 256, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 100}, {'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 101}, {'filter': 256, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 102}, {'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 103}, {'filter': 256, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 104}, {'filter': 255, 'kernel': 1, 'stride': 1, 'bnorm': False, 'leaky': False, 'layer_idx': 105}], skip=False) model = Model(input_image, [yolo_82, yolo_94, yolo_106]) return model # Step 4: # Prediction def load_image_pixels(filename, shape): """ Loading the image to model and make prediction """ # Load image to get its shape image = load_img(filename) width, height = image.size # Load image with required size image = load_img(filename, target_size=shape) image = img_to_array(image) # Grayscale image normalization image = image.astype('float32') image /= 255.0 # Add a dimension so that we have one sample image = expand_dims(image, 0) return image, width, height # Step 4: Decode the prediction output to rectangle coordinates class BoundBox: """ BoundBox class is used to return object bounding box coordinates, object name and threshold score decode_netout` function is used to decode the prediction output to rectangle coordinates """ def __init__(self, xmin, ymin, xmax, ymax, objness = None, classes = None): self.xmin = xmin self.ymin = ymin self.xmax = xmax self.ymax = ymax self.objness = objness self.classes = classes self.label = -1 self.score = -1 def get_label(self): """ Gets the label of the current object """ if self.label == -1: self.label = np.argmax(self.classes) return self.label def get_score(self): """ Gets the score of the current object """ if self.score == -1: self.score = self.classes[self.get_label()] return self.get_score def _sigmoid(inp): return 1. / (1. + np.exp(-inp)) def decode_netout(netout, anchors, obj_thresh, net_h, net_w): """ Decode output information of network. """ grid_h, grid_w = netout.shape[:2] nb_box = 3 netout = netout.reshape((grid_h, grid_w, nb_box, -1)) boxes = [] netout[..., :2] = _sigmoid(netout[..., :2]) netout[..., 4:] = _sigmoid(netout[..., 4:]) netout[..., 5:] = netout[..., 4][..., np.newaxis] * netout[..., 5:] netout[..., 5:] *= netout[..., 5:] > obj_thresh for i in range(grid_h * grid_w): row = i / grid_w col = i % grid_w for j in range(nb_box): # 4th element is objectness score objectness = netout[int(row)][int(col)][j][4] if objectness.all() <= obj_thresh: continue # First 4 elements are x, y, w, and h x, y, w, h = netout[int(row)][int(col)][j][:4] x = (col + x) / grid_w # Center position, unit: image width y = (row + y) / grid_h # Center position, unit: image height w = anchors[2 * j + 0] * np.exp(w) / net_w # Unit: image width h = anchors[2 * j + 1] * np.exp(h) / net_h # Unit: image height # Last elements are class probabilities classes = netout[int(row)][col][j][5:] box = BoundBox(x - w / 2, y - h / 2, x + w / 2, y + h / 2, objectness, classes) boxes.append(box) return boxes # Step 5 def correct_yolo_boxes(boxes, image_h, image_w, net_h, net_w): """ Strech the box to be fit to the image normal shape """ new_w, new_h = net_w, net_h for box in boxes: x_offset, x_scale = (net_w - new_w) / 2. / net_w, float(new_w) / net_w y_offset, y_scale = (net_h - new_h) / 2. / net_h, float(new_h) / net_h box.xmin = int((box.xmin - x_offset) / x_scale * image_w) box.xmax = int((box.xmax - x_offset) / x_scale * image_w) box.ymin = int((box.ymin - y_offset) / y_scale * image_h) box.ymax = int((box.ymax - y_offset) / y_scale * image_h) # Step 6 def _interval_overlap(interval_a, interval_b): """ Implementing IOU """ x1, x2 = interval_a x3, x4 = interval_b if x3 < x1: if x4 < x1: return 0 else: return min(x2,x4) - x1 else: if x2 < x3: return 0 else: return min(x2,x4) - x3 def bbox_iou(box1, box2): intersect_w = _interval_overlap([box1.xmin, box1.xmax], [box2.xmin, box2.xmax]) intersect_h = _interval_overlap([box1.ymin, box1.ymax], [box2.ymin, box2.ymax]) intersect = intersect_w * intersect_h w1, h1 = box1.xmax-box1.xmin, box1.ymax-box1.ymin w2, h2 = box2.xmax-box2.xmin, box2.ymax-box2.ymin union = w1*h1 + w2*h2 - intersect return float(intersect) / union def do_nms(boxes, nms_thresh): if len(boxes) > 0: nb_class = len(boxes[0].classes) else: return for c in range(nb_class): sorted_indices = np.argsort([-box.classes[c] for box in boxes]) for i in range(len(sorted_indices)): index_i = sorted_indices[i] if boxes[index_i].classes[c] == 0: continue for j in range(i+1, len(sorted_indices)): index_j = sorted_indices[j] if bbox_iou(boxes[index_i], boxes[index_j]) >= nms_thresh: boxes[index_j].classes[c] = 0 # get all of the results above a threshold def get_boxes(boxes, labels, thresh): v_boxes, v_labels, v_scores = list(), list(), list() # enumerate all boxes for box in boxes: # enumerate all possible labels for i in range(len(labels)): # check if the threshold for this label is high enough if box.classes[i] > thresh: v_boxes.append(box) v_labels.append(labels[i]) v_scores.append(box.classes[i]*100) # don't break, many labels may trigger for one box return v_boxes, v_labels, v_scores # draw all results def draw_boxes(filename, v_boxes, v_labels, v_scores): # load the image data = pyplot.imread(filename) # plot the image pyplot.imshow(data) # get the context for drawing boxes ax = pyplot.gca() # plot each box for i in range(len(v_boxes)): box = v_boxes[i] # get coordinates y1, x1, y2, x2 = box.ymin, box.xmin, box.ymax, box.xmax # calculate width and height of the box width, height = x2 - x1, y2 - y1 # create the shape rect = Rectangle((x1, y1), width, height, fill=False, color='red', linewidth = '2') # draw the box ax.add_patch(rect) # draw text and score in top left corner label = "%s (%.3f)" % (v_labels[i], v_scores[i]) pyplot.text(x1, y1, label, color='red') # show the plot pyplot.show() """**step 7:** declare several configuration""" # Define the anchors ANCHORS = [[116,90, 156,198, 373,326], [30,61, 62,45, 59,119], [10,13, 16,30, 33,23]] # Define the probability threshold for detected objects CLASS_THRESHOLD = 0.6 # Define the labels LABELS = ["person", "bicycle", "car", "motorbike", "aeroplane", "bus", "train", "truck", "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "sofa", "pottedplant", "bed", "diningtable", "toilet", "tvmonitor", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"] def main(): """ Defined starting point of source code. """ # Step 3: # (1) Define the model # (2) Load the weight # (3) Save the model # Define the YOLO v3 model yolov3 = make_yolov3_model() print(yolov3.summary()) # Load the weights # Source: https://pjreddie.com/media/files/yolov3.weights weight_reader = WeightReader('yolov3.weights') # Set the weights weight_reader.load_weights(yolov3) # Save the model to file yolov3.save('yolov3.h5') # Step 8: # Make Prediction for photo_filename in glob.glob("images/test/dog/*"): # for fn in upload.keys(): # photo_filename = '/content/' + fn # photo_filename = 'test.jpg' # define the expected input shape for the model input_w, input_h = 416, 416 image, image_w, image_h = load_image_pixels(photo_filename, (input_w, input_h)) # make prediction yhat = yolov3.predict(image) # summarize the shape of the list of arrays print([a.shape for a in yhat]) boxes = list() for i in range(len(yhat)): # decode the output of the network boxes += decode_netout(yhat[i][0], ANCHORS[i], CLASS_THRESHOLD, input_h, input_w) # correct the sizes of the bounding boxes for the shape of the image correct_yolo_boxes(boxes, image_h, image_w, input_h, input_w) # suppress non-maximal boxes do_nms(boxes, 0.5) # get the details of the detected objects v_boxes, v_labels, v_scores = get_boxes(boxes, LABELS, CLASS_THRESHOLD) # summarize what we found for i in range(len(v_boxes)): print(v_labels[i], v_scores[i]) # draw what we found draw_boxes(photo_filename, v_boxes, v_labels, v_scores) print([a.shape for a in yhat]) if __name__ == "__main__": main()