# -*- coding: utf-8 -*-
# dataset.py (forked from JJBOY/BMN-Boundary-Matching-Network)
import json

import numpy as np
import pandas as pd
import torch
import torch.utils.data as data

from utils import ioa_with_anchors, iou_with_anchors


def load_json(file):
    with open(file) as json_file:
        json_data = json.load(json_file)
        return json_data


class VideoDataSet(data.Dataset):
    def __init__(self, opt, subset="train"):
        self.temporal_scale = opt["temporal_scale"]  # e.g. 100
        self.temporal_gap = 1. / self.temporal_scale
        self.subset = subset
        self.mode = opt["mode"]
        self.feature_path = opt["feature_path"]
        self.video_info_path = opt["video_info"]
        self.video_anno_path = opt["video_anno"]
        self._getDatasetDict()
        self.anchor_xmin = [self.temporal_gap * (i - 0.5) for i in range(self.temporal_scale)]
        self.anchor_xmax = [self.temporal_gap * (i + 0.5) for i in range(self.temporal_scale)]
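        # Each of the temporal_scale positions i gets a one-gap-wide anchor
        # window [(i - 0.5) / T, (i + 0.5) / T] in normalized time; with the
        # default T = 100, position 3 covers [0.025, 0.035]. These windows
        # are matched against the ground-truth boundary regions built in
        # _get_train_label below.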

    def _getDatasetDict(self):
        anno_df = pd.read_csv(self.video_info_path)
        anno_database = load_json(self.video_anno_path)
        self.video_dict = {}
        for i in range(len(anno_df)):
            video_name = anno_df.video.values[i]
            video_info = anno_database[video_name]
            video_subset = anno_df.subset.values[i]
            if self.subset in video_subset:
                self.video_dict[video_name] = video_info
        self.video_list = list(self.video_dict.keys())
        print("%s subset video numbers: %d" % (self.subset, len(self.video_list)))
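    # The loader above assumes the video-info CSV has at least the columns
    # `video` and `subset`, and that the annotation JSON is keyed by video
    # name, each entry carrying `duration_frame`, `duration_second`,
    # `feature_frame`, and an `annotations` list of {"segment": [start, end]}
    # dicts in seconds (exactly the fields read in _get_train_label).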

    def __getitem__(self, index):
        video_data = self._load_file(index)
        if self.mode == "train":
            match_score_start, match_score_end, confidence_score = self._get_train_label(
                index, self.anchor_xmin, self.anchor_xmax)
            return video_data, confidence_score, match_score_start, match_score_end
        else:
            return index, video_data

    def _load_file(self, index):
        video_name = self.video_list[index]
        video_df = pd.read_csv(self.feature_path + "csv_mean_" + str(self.temporal_scale) + "/" + video_name + ".csv")
        video_data = video_df.values[:, :]
        video_data = torch.Tensor(video_data)
        video_data = torch.transpose(video_data, 0, 1)
        video_data = video_data.float()  # assign the result; .float() is not in-place
        return video_data
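    # _load_file expects one CSV per video with temporal_scale rows (one per
    # rescaled snippet) and one column per feature channel; the transpose
    # yields a (channels, temporal_scale) tensor. For the ActivityNet TSN
    # features commonly used with BMN that would be (400, 100), but the
    # channel count simply follows whatever features were extracted.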

    def _get_train_label(self, index, anchor_xmin, anchor_xmax):
        video_name = self.video_list[index]
        video_info = self.video_dict[video_name]
        video_frame = video_info['duration_frame']
        video_second = video_info['duration_second']
        feature_frame = video_info['feature_frame']
        corrected_second = float(feature_frame) / video_frame * video_second  # some trailing frames are not covered by features
        video_labels = video_info['annotations']  # annotations are given in seconds, not frames

        ##############################################################################################
        # convert the annotations from seconds to a fraction of the video length, clipped to [0, 1]
        gt_bbox = []
        for j in range(len(video_labels)):
            tmp_info = video_labels[j]
            tmp_start = max(min(1, tmp_info['segment'][0] / corrected_second), 0)
            tmp_end = max(min(1, tmp_info['segment'][1] / corrected_second), 0)
            gt_bbox.append([tmp_start, tmp_end])
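        # Worked example (hypothetical numbers): a 120 s video whose features
        # cover 98% of the frames gives corrected_second = 117.6, so an
        # annotated segment [10.0, 30.0] in seconds is normalized to roughly
        # [0.085, 0.255].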

        ####################################################################################################
        # generate R_s and R_e
        gt_bbox = np.array(gt_bbox)
        gt_xmins = gt_bbox[:, 0]
        gt_xmaxs = gt_bbox[:, 1]
        gt_lens = gt_xmaxs - gt_xmins
        gt_len_small = 3 * self.temporal_gap  # np.maximum(self.temporal_gap, self.boundary_ratio * gt_lens)
        gt_start_bboxs = np.stack((gt_xmins - gt_len_small / 2, gt_xmins + gt_len_small / 2), axis=1)
        gt_end_bboxs = np.stack((gt_xmaxs - gt_len_small / 2, gt_xmaxs + gt_len_small / 2), axis=1)
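        # Each ground-truth start/end point is expanded into a boundary region
        # three temporal gaps wide (3 / temporal_scale, i.e. 0.03 at the
        # default scale of 100), centered on the boundary; the commented-out
        # alternative above would instead scale the region with segment length.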

        #####################################################################################################
        gt_iou_map = np.zeros([self.temporal_scale, self.temporal_scale])
        for i in range(self.temporal_scale):
            for j in range(i, self.temporal_scale):
                gt_iou_map[i, j] = np.max(
                    iou_with_anchors(i * self.temporal_gap, (j + 1) * self.temporal_gap, gt_xmins, gt_xmaxs))
        gt_iou_map = torch.Tensor(gt_iou_map)
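        # gt_iou_map[i, j] is the best IoU between the candidate proposal
        # spanning [i / T, (j + 1) / T] and any ground-truth segment; entries
        # below the diagonal (end before start) stay zero.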

        ##########################################################################################################
        # calculate the IoA of every anchor window with the boundary regions
        match_score_start = []
        for jdx in range(len(anchor_xmin)):
            match_score_start.append(np.max(
                ioa_with_anchors(anchor_xmin[jdx], anchor_xmax[jdx], gt_start_bboxs[:, 0], gt_start_bboxs[:, 1])))
        match_score_end = []
        for jdx in range(len(anchor_xmin)):
            match_score_end.append(np.max(
                ioa_with_anchors(anchor_xmin[jdx], anchor_xmax[jdx], gt_end_bboxs[:, 0], gt_end_bboxs[:, 1])))
        match_score_start = torch.Tensor(match_score_start)
        match_score_end = torch.Tensor(match_score_end)
        ############################################################################################################
        return match_score_start, match_score_end, gt_iou_map

    def __len__(self):
        return len(self.video_list)


if __name__ == '__main__':
    import opts

    opt = opts.parse_opt()
    opt = vars(opt)
    train_loader = torch.utils.data.DataLoader(VideoDataSet(opt, subset="train"),
                                               batch_size=opt["batch_size"], shuffle=True,
                                               num_workers=8, pin_memory=True)
    for a, b, c, d in train_loader:
        print(a.shape, b.shape, c.shape, d.shape)
        break
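    # With the repo default temporal_scale = 100, the printed shapes should
    # be: a = (batch_size, C, 100) video features, b = (batch_size, 100, 100)
    # gt_iou_map, and c = d = (batch_size, 100) start/end match scores, where
    # C is the feature-channel count of the CSVs (400 for the usual TSN
    # features, though that depends on the extraction).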