The YOLO neural network architecture was first published in 2015; many improvements have been made since the first version, and the model is now at its third release.
The network structure can be broadly summarized as a simple CNN whose output is generated by applying 1 × 1 detection kernels on the feature map. This detection is performed in three different places of the network, i.e. three YOLO detection layers are distributed along the network structure. The shape of each detection kernel is 1 × 1 × (B × (5 + C)), where B is the number of bounding boxes that a cell of the feature map can predict and C is the number of classes. The fixed number ('5') is given by the 4 bounding box attributes plus one object confidence coefficient (the so-called objectness in the code).
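As a quick example (a minimal sketch assuming the standard COCO-like configuration with B = 3 boxes per cell and C = 80 classes, values not taken from the code below), the depth of each detection kernel is:

B = 3                 # bounding boxes predicted by each cell of the feature map
C = 80                # number of classes (e.g. COCO)
depth = B * (5 + C)   # 4 box attributes + 1 objectness + C class scores
print(depth)          # 255, i.e. each detection kernel has shape 1 x 1 x 255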
For further information about the YOLO layer, see the original paper.
This is the forward function of the layer:
def forward(self, inpt, truth, net_shape, index):
    '''
    Forward function of the YOLO layer: apply the logistic activation to the
    box centers, the objectness and the class scores, then (if the layer is
    trainable) compare the predictions against the ground truth.
    '''
    # NOTE: the input (probably) should be given in (c, h, w) fmt
    # y_pred is the previous layer output, thus the input

    ################## TEMPORARY SOLUTION
    # element-wise logistic activation on the flattened output
    self.output = inpt.copy().ravel()

    size_delta = np.prod(self.out_shape)
    size = np.prod(self.out_shape[1:]) // self.nb_box

    for i in range(size_delta):
        idx = i % size  # do not shadow the 'index' argument, used later in the summary print
        # the (w, h) coordinates are left as linear outputs, everything else goes
        # through the logistic; wh2 and whcoords are assumed to delimit the (w, h)
        # slice of each box and are not defined yet in this partial implementation
        if idx >= wh2 and idx < whcoords:
            continue
        self.output[i] = 1. / (1. + np.exp(-self.output[i]))

    self.delta = np.zeros(shape=self.out_shape, dtype=float)

    if not self.trainable:
        return

    # compute grid factor and net factor
    # NOTE: the ground truth is assumed to be batched, i.e. shaped as
    # (batch, grid_h, grid_w, nb_box, 4 + 1 + classes), as in the keras reference
    self.grid_h, self.grid_w = truth.shape[1 : 3]
    grid_factor = np.array([self.grid_w, self.grid_h], dtype=float).reshape((1, 1, 1, 1, 2))

    net_w, net_h = net_shape
    net_factor = np.array([net_w, net_h], dtype=float).reshape((1, 1, 1, 1, 2))

    # y_pred is the raw input (see the NOTE above), reshaped like the ground truth
    y_pred = inpt.reshape(truth.shape)
    y_true = truth

    # Adjust prediction
    pred_box_xy = self.cell_grid[:, : self.grid_h, : self.grid_w, :, :] + 1. / (1. + np.exp(-y_pred[..., 0 : 2]))
    pred_box_wh = y_pred[..., 2 : 4]
    pred_box_conf = np.expand_dims(1. / (1. + np.exp(-y_pred[..., 4])), axis=4)
    pred_box_class = y_pred[..., 5 :]

    # Adjust ground truth
    true_box_xy = y_true[..., 0 : 2]
    true_box_wh = y_true[..., 2 : 4]
    true_box_conf = np.expand_dims(y_true[..., 4], axis=4)
    true_box_class = np.argmax(y_true[..., 5 :], axis=-1)

    # mask of the cells that actually contain an object (as in the keras reference)
    object_mask = np.expand_dims(y_true[..., 4], axis=4)

    # Compare each predicted box to all true boxes

    # initially, drag the objectness of all boxes towards 0
    conf_delta = pred_box_conf - 0

    # then, ignore the boxes which have a good overlap with some true box
    # NOTE: true_boxes (the full list of annotation boxes of the batch, as in the
    # keras version) is not handled yet by this partial implementation
    true_xy = true_boxes[..., 0 : 2] / grid_factor
    true_wh = true_boxes[..., 2 : 4] / net_factor

    # NOTE: probably in our configuration (x, y, w, h) we do not need to center
    # the boxes in the following way
    true_wh_half = true_wh * .5
    true_mins = true_xy - true_wh_half
    true_maxes = true_xy + true_wh_half

    pred_xy = np.expand_dims(pred_box_xy / grid_factor, axis=4)
    pred_wh = np.expand_dims(np.exp(pred_box_wh) * self.anchors / net_factor, axis=4)

    pred_wh_half = pred_wh * .5
    pred_mins = pred_xy - pred_wh_half
    pred_maxes = pred_xy + pred_wh_half

    intersect_mins = np.maximum(pred_mins, true_mins)
    intersect_maxes = np.minimum(pred_maxes, true_maxes)
    intersect_wh = np.maximum(intersect_maxes - intersect_mins, 0.)
    intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]

    true_areas = true_wh[..., 0] * true_wh[..., 1]
    pred_areas = pred_wh[..., 0] * pred_wh[..., 1]

    union_areas = pred_areas + true_areas - intersect_areas
    iou_scores = np.divide(intersect_areas, union_areas)

    best_ious = np.max(iou_scores, axis=4)
    conf_delta *= np.expand_dims(best_ious < self.ignore_thresh, axis=4)

    # Compute some online statistics
    true_xy = true_box_xy / grid_factor
    true_wh = np.exp(true_box_wh) * self.anchors / net_factor

    # the same possible troubles with the centering of boxes
    true_wh_half = true_wh * .5
    true_mins = true_xy - true_wh_half
    true_maxes = true_xy + true_wh_half

    pred_xy = pred_box_xy / grid_factor
    pred_wh = np.exp(pred_box_wh) * self.anchors / net_factor

    pred_wh_half = pred_wh * .5
    pred_mins = pred_xy - pred_wh_half
    pred_maxes = pred_xy + pred_wh_half

    intersect_mins = np.maximum(pred_mins, true_mins)
    intersect_maxes = np.minimum(pred_maxes, true_maxes)
    intersect_wh = np.maximum(intersect_maxes - intersect_mins, 0.)
    intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]

    true_areas = true_wh[..., 0] * true_wh[..., 1]
    pred_areas = pred_wh[..., 0] * pred_wh[..., 1]

    union_areas = pred_areas + true_areas - intersect_areas
    iou_scores = np.divide(intersect_areas, union_areas)
    iou_scores = object_mask * np.expand_dims(iou_scores, axis=4)

    count = np.sum(object_mask)
    count_noobj = np.sum(1. - object_mask)
    detect_mask = (pred_box_conf * object_mask) >= .5
    class_mask = np.expand_dims(pred_box_class.argmax(axis=-1) == true_box_class, axis=4)

    recall50 = np.sum(((iou_scores >= .5 ) * detect_mask * class_mask) / (count + 1e-3))
    recall75 = np.sum(((iou_scores >= .75) * detect_mask * class_mask) / (count + 1e-3))
    avg_iou = np.sum(iou_scores) / (count + 1e-3)
    avg_obj = np.sum(pred_box_conf * object_mask) / (count + 1e-3)
    avg_noobj = np.sum(pred_box_conf * (1. - object_mask)) / (count_noobj + 1e-3)
    avg_cat = np.sum(object_mask * class_mask) / (count + 1e-3)

    # # Warm-up training
    # # MISS (line 149-157)

    # # Compare each true box to all anchor boxes
    # wh_scale = np.exp(true_box_wh) * self.anchors / net_factor
    # wh_scale = np.expand_dims(2 - wh_scale[..., 0] * wh_scale[..., 1], axis=4)  # the smaller the box, the bigger the scale

    # xy_delta    = xywh_mask * (pred_box_xy - true_box_xy) * wh_scale * self.xywh_scale
    # wh_delta    = xywh_mask * (pred_box_wh - true_box_wh) * wh_scale * self.xywh_scale
    # conf_delta  = object_mask * (pred_box_conf - true_box_conf) * self.obj_scale + (1 - object_mask) * conf_delta * self.noobj_scale
    # class_delta = object_mask * ... * self.class_scale  # MISS (line 168)

    # loss_xy    = np.sum(xy_delta * xy_delta, axis=tuple(range(1, 5)))
    # loss_wh    = np.sum(wh_delta * wh_delta, axis=tuple(range(1, 5)))
    # loss_conf  = np.sum(conf_delta * conf_delta, axis=tuple(range(1, 5)))
    # loss_class = np.sum(class_delta, axis=tuple(range(1, 5)))

    # loss = loss_xy + loss_wh + loss_conf + loss_class

    print('Yolo {:d} Avg IOU: {:.3f}, Class: {:.3f}, Obj: {:.3f}, No Obj: {:.3f}, .5R: {:.3f}, .75R: {:.3f}, count: {:d}'.format(
          index, avg_iou, avg_cat, avg_obj, avg_noobj, recall50, recall75, int(count)))

    # TODO: fill self.delta with the error terms computed above; the loss part of
    # the implementation is still missing, so the delta is only re-initialized here
    self.delta = np.zeros(shape=self.out_shape, dtype=float)
This implementation is heavily inspired by the Keras version of the layer. Note that it is only partial for now.
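For reference, here is a minimal stand-alone sketch of the IoU computation performed above, applied to a single pair of boxes expressed as (x_min, y_min, x_max, y_max) corners; the box values are hypothetical and serve only as an illustration:

import numpy as np

# two boxes in (x_min, y_min, x_max, y_max) format (hypothetical values)
pred_box = np.array([0.2, 0.2, 0.6, 0.6])
true_box = np.array([0.3, 0.3, 0.7, 0.7])

# intersection rectangle (clipped to zero when the boxes do not overlap)
intersect_mins  = np.maximum(pred_box[:2], true_box[:2])
intersect_maxes = np.minimum(pred_box[2:], true_box[2:])
intersect_wh    = np.maximum(intersect_maxes - intersect_mins, 0.)
intersect_area  = intersect_wh[0] * intersect_wh[1]

# union = area of the first box + area of the second box - intersection
pred_area = (pred_box[2] - pred_box[0]) * (pred_box[3] - pred_box[1])
true_area = (true_box[2] - true_box[0]) * (true_box[3] - true_box[1])
union_area = pred_area + true_area - intersect_area

iou = intersect_area / union_area
print(iou)  # ~0.39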
The backward function simply transfers self.delta, computed in forward, back to the previous layer:
def backward(self, delta):
    '''
    Backward function of the YOLO layer: accumulate self.delta (computed in
    the forward pass) into the delta buffer of the previous layer.
    '''
    delta[:] += self.delta
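A small usage sketch (shapes and values are purely hypothetical) highlights the design choice: the layer error is accumulated into the caller's buffer in place, with delta[:] +=, instead of overwriting it:

import numpy as np

# stand-ins for the delta buffer of the previous layer and for self.delta
prev_delta = np.zeros(shape=(2, 13, 13, 255), dtype=float)
yolo_delta = np.random.uniform(size=prev_delta.shape)

prev_delta[:] += yolo_delta   # what backward does: in-place accumulation
assert np.allclose(prev_delta, yolo_delta)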