Skip to content

Latest commit

 

History

History
168 lines (121 loc) · 6.85 KB

yolo_layer.md

File metadata and controls

168 lines (121 loc) · 6.85 KB

YOLO Layer

The YOLO Neural Network architecture was first published in 2015, but since the first version many improvements have been made, and we now have the third version of it.

The network structure can be broadly summarized as a simple CNN whose output is generated by applying a series of three different detection 1 × 1 kernels on the feature map. Moreover, this detection is performed in three different places in the network, i.e. three YOLO detection layers are distributed along the network structure. The shape of the detection kernel is 1 × 1 × (B × (5 + C)), where B is the number of bounding boxes a cell on the feature map can predict and C is the number of classes. The fixed number ('5') is given by 4 bounding box attributes plus one object confidence coefficient (the so-called objectness in the code).

For further information about the YOLO layer, here's the Original paper.

This is the forward function of the layer:

def forward(self, inpt, truth, net_shape, index):
	'''
	Forward pass of the YOLO detection layer.

	Applies the logistic activation to the x, y and objectness/class
	channels of the raw input and, when the layer is trainable, computes
	some online detection statistics against the ground truth.

	Parameters
	----------
	inpt : array-like
		Output of the previous layer.
		NOTE: The input (probably) should be given in (c, h, w) fmt.
	truth : array-like
		Ground-truth tensor; its first two dimensions give the grid size.
	net_shape : tuple
		(net_w, net_h) spatial resolution of the whole network input.
	index : int
		Index of this YOLO layer inside the network (used for logging only).

	Notes
	-----
	The implementation is only partial: 'y_pred' and 'true_boxes' are
	still undefined and the delta/loss computation is commented out, so
	self.delta stays zero.
	'''
	# y_pred is the previous output thus the input

	################## TEMPORARY SOLUTION

	# just a simple 1D convolution
	self.output = inpt.copy().ravel()
	size_delta = np.prod(self.out_shape)
	size = np.prod(self.out_shape[1:]) // self.nb_box

	# Per-box channel layout is (x, y, w, h, obj, classes...), each channel
	# covering w*h cells: the logistic must touch every channel except the
	# raw w, h ones, i.e. skip flat offsets in [2*w*h, 4*w*h).
	# NOTE(review): assumes out_shape is (batch, channels, h, w) — TODO confirm
	wh = np.prod(self.out_shape[2:])
	wh2, whcoords = 2 * wh, 4 * wh

	# BUGFIX: the loop variable used to be named 'index', clobbering the
	# function parameter needed by the final print; the sigmoid referenced
	# 'self.ouput' (typo) and its result was computed but never stored.
	for i in range(size_delta):
		idx = i % size
		if wh2 <= idx < whcoords:  # leave the w, h channels raw
			continue
		self.output[i] = 1. / (1. + np.exp(-self.output[i]))

	self.delta = np.zeros(shape=self.out_shape, dtype=float)

	if not self.trainable:
		return

	# compute grid factor and net factor
	self.grid_w, self.grid_h = truth.shape[:2]
	grid_factor = np.array([self.grid_w, self.grid_h], dtype=float).reshape((1, 1, 1, 1, 2))

	net_w, net_h = net_shape
	net_factor   = np.array([net_w, net_h], dtype=float).reshape((1, 1, 1, 1, 2))

	# NOTE(review): 'y_pred' is still undefined in this partial
	# implementation — it should presumably be the raw layer input reshaped
	# to (batch, h, w, nb_box, 5 + classes); left as-is on purpose.

	# Adjust prediction
	# BUGFIX: sigmoid is 1 / (1 + exp(-x)) — the exponent sign was wrong
	# here and in pred_box_conf; 'grid_h'/'grid_w' lacked the 'self.' prefix.
	pred_box_xy    = (self.cell_grid[:, :self.grid_h, :self.grid_w, :, :] + 1. / (1. + np.exp(-y_pred[..., :2])))
	pred_box_wh    = y_pred[..., 2 : 4]
	pred_box_conf  = np.expand_dims(1. / (1. + np.exp(-y_pred[..., 4])), axis=4)
	pred_box_class = y_pred[..., 5:]

	# Adjust ground truth
	# BUGFIX: the ground truth is the 'truth' parameter ('y_true' was undefined)
	true_box_xy    = truth[..., 0 : 2]
	true_box_wh    = truth[..., 2 : 4]
	true_box_conf  = np.expand_dims(truth[..., 4], axis=4)
	true_box_class = np.argmax(truth[..., 5:], axis=-1)

	# Objectness of the ground-truth cells, used to mask the statistics.
	# NOTE(review): in the reference keras implementation object_mask is
	# exactly y_true[..., 4:5], i.e. true_box_conf — TODO confirm
	object_mask = true_box_conf

	# Compare each predicted box to all true boxes

	# initially, drag all objectness of all boxes to 0
	conf_delta = pred_box_conf - 0

	# then, ignore the boxes which have good overlap with some true box
	# NOTE(review): 'true_boxes' (the full list of ground-truth boxes, a
	# separate input in the keras version) is still undefined here.
	true_xy = true_boxes[..., 0 : 2] / grid_factor
	true_wh = true_boxes[..., 2 : 4] / net_factor

	# NOTE: probably in our configuration (x, y, w, h) we do not need to center boxes in the following way
	true_wh_half = true_wh * .5
	true_mins    = true_xy - true_wh_half
	true_maxes   = true_xy + true_wh_half

	pred_xy = np.expand_dims(pred_box_xy / grid_factor, axis=4)
	pred_wh = np.expand_dims(np.exp(pred_box_wh) * self.anchors / net_factor, axis=4)

	pred_wh_half = pred_wh * .5
	pred_mins    = pred_xy - pred_wh_half
	pred_maxes   = pred_xy + pred_wh_half

	intersect_mins  = np.maximum(pred_mins,  true_mins)
	intersect_maxes = np.minimum(pred_maxes, true_maxes)

	intersect_wh    = np.maximum(intersect_maxes - intersect_mins, 0.)
	intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]

	true_areas  = true_wh[..., 0] * true_wh[..., 1]
	pred_areas  = pred_wh[..., 0] * pred_wh[..., 1]

	union_areas = pred_areas + true_areas - intersect_areas
	iou_scores  = np.divide(intersect_areas, union_areas)

	best_ious   = np.max(iou_scores, axis=4)
	conf_delta *= np.expand_dims(best_ious < self.ignore_thresh, axis=4)

	# Compute some online statistics

	true_xy = true_box_xy / grid_factor
	true_wh = np.exp(true_box_wh) * self.anchors / net_factor

	# the same possible troubles with the centering of boxes
	true_wh_half = true_wh * .5
	true_mins    = true_xy - true_wh_half
	true_maxes   = true_xy + true_wh_half

	pred_xy = pred_box_xy / grid_factor
	pred_wh = np.exp(pred_box_wh) * self.anchors / net_factor

	pred_wh_half = pred_wh * .5
	pred_mins    = pred_xy - pred_wh_half
	pred_maxes   = pred_xy + pred_wh_half

	intersect_mins  = np.maximum(pred_mins, true_mins)
	intersect_maxes = np.minimum(pred_maxes, true_maxes)
	intersect_wh    = np.maximum(intersect_maxes - intersect_mins, 0.)
	intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]

	true_areas = true_wh[..., 0] * true_wh[..., 1]
	pred_areas = pred_wh[..., 0] * pred_wh[..., 1]

	union_areas = pred_areas + true_areas - intersect_areas
	iou_scores = np.divide(intersect_areas, union_areas)
	iou_scores = object_mask * np.expand_dims(iou_scores, axis=4)

	count = np.sum(object_mask)
	count_noobj = np.sum(1. - object_mask)
	detect_mask = pred_box_conf * object_mask >= .5
	class_mask = np.expand_dims(pred_box_class.argmax(axis=-1) == true_box_class, axis=4)

	# BUGFIX: the second sum was overwriting 'recall50' instead of filling
	# 'recall75', which the print below requires
	recall50 = np.sum( ((iou_scores >= .5)  * detect_mask * class_mask) / (count + 1e-3) )
	recall75 = np.sum( ((iou_scores >= .75) * detect_mask * class_mask) / (count + 1e-3) )
	avg_iou = np.sum(iou_scores) / (count + 1e-3)
	avg_obj = np.sum(pred_box_conf * object_mask) / (count + 1e-3)
	avg_noobj = np.sum(pred_box_conf * (1. - object_mask)) / (count_noobj + 1e-3)
	avg_cat = np.sum(object_mask * class_mask) / (count + 1e-3)

	# #  Warm-up training
	# # MISS (line 149-157)

	# # Compare each true box to all anchor boxes
	# wh_scale = np.exp(true_box_wh) * self.anchors / net_factor
	# wh_scale = np.expand_dims(2 - wh_scale[..., 0] * wh_scale[..., 1], axis=4) # the smaller the box, the bigger the scale

	# xy_delta    = xywh_mask   * (pred_box_xy   - true_box_xy) * wh_scale * self.xywh_scale
	# wh_delta    = xywh_mask   * (pred_box_wh   - true_box_wh) * wh_scale * self.xywh_scale
	# conf_delta  = object_mask * (pred_box_conf - true_box_conf) * self.obj_scale + (1 - object_mask) * conf_delta * self.noobj_scale
	# class_delta = object_mask *   * self.class_scale # MISS (line 168)

	# loss_xy    = np.sum(xy_delta * xy_delta,     axis=tuple(range(1, 5)))
	# loss_wh    = np.sum(wh_delta * wh_delta,     axis=tuple(range(1, 5)))
	# loss_conf  = np.sum(conf_delta * conf_delta, axis=tuple(range(1, 5)))
	# loss_class = np.sum(class_delta,             axis=tuple(range(1, 5)))

	# loss = loss_xy + loss_wh + loss_conf + loss_class

	# BUGFIX: 'count' is a NumPy float (result of np.sum), which the '{:d}'
	# spec rejects — cast to int for printing
	print('Yolo {:d} Avg IOU: {:.3f}, Class: {:.3f}, Obj: {:.3f}, No Obj: {:.3f}, .5R: {:.3f}, .75R: {:.3f}, count: {:d}'.format(
				index, avg_iou, avg_cat, avg_obj, avg_noobj, recall50, recall75, int(count)))

	# delta/loss computation still missing: keep a zero gradient for now
	self.delta = np.zeros(shape=self.out_shape, dtype=float)

This implementation is highly inspired by the Keras version of the layer. Note that the implementation is only partial for now.

While the backward function simply transfers back self.delta, computed in forward:

def backward(self, delta):
	'''
	Backward pass of the YOLO layer.

	Accumulates the error stored in self.delta (filled by forward)
	into the delta of the previous layer.

	Parameters
	----------
	delta : array-like
		Error tensor of the previous layer, updated in place.
	'''
	# element-wise accumulation, written back into the caller's buffer
	delta[:] = delta + self.delta