#!/usr/bin/env python
"""Easily convert RGB video data (e.g. .avi) to the TensorFlow tfrecords file format with the provided 3 color channels.
Allows to subsequently train a neural network in TensorFlow with the generated tfrecords.
Due to common hardware/GPU RAM limitations, this implementation allows to limit the number of frames per
video actually stored in the tfrecords. The code automatically chooses the frame step size such that there is
an equal separation distribution of the video images. Implementation supports Optical Flow
(currently OpenCV's calcOpticalFlowFarneback) as an additional 4th channel.
"""
from tensorflow.python.platform import gfile
from tensorflow.python.platform import flags
from tensorflow.python.platform import app
import cv2
import numpy as np
import math
import os
import tensorflow as tf
import time
FLAGS = flags.FLAGS
flags.DEFINE_integer('n_videos_in_record', 10,
'Number of videos stored in one single tfrecord file')
flags.DEFINE_string('image_color_depth', "uint8",
'Color depth as string for the images stored in the tfrecord files. '
'Has to correspond to the source video color depth. '
'Specified as dtype (e.g. uint8 or uint16)')
flags.DEFINE_string('file_suffix', "*.mp4",
'defines the video file type, e.g. .mp4')
flags.DEFINE_string('source', './example/input', 'Directory with video files')
flags.DEFINE_string('destination', './example/output',
'Directory for storing tf records')
flags.DEFINE_boolean('optical_flow', True,
'Indicates whether optical flow shall be computed and added as fourth '
'channel.')
flags.DEFINE_integer('width_video', 1280, 'the width of the videos in pixels')
flags.DEFINE_integer('height_video', 720, 'the height of the videos in pixels')
flags.DEFINE_integer('n_frames_per_video', 5,
'specifies the number of frames to be taken from each video')
flags.DEFINE_integer('n_channels', 4,
'specifies the number of channels the videos have')
flags.DEFINE_string('video_filenames', None,
                    'specifies the video file names as a list, in case the video paths should '
                    'not be determined by the script')
def _int64_feature(value):
return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
def _bytes_feature(value):
return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
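# Example: _int64_feature(720) wraps an integer and _bytes_feature(b'...')
# wraps a byte string into tf.train.Feature protos; both are collected into a
# tf.train.Features map in save_numpy_to_tfrecords below.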
def get_chunks(l, n):
"""Yield successive n-sized chunks from l.
Used to create n sublists from a list l"""
for i in range(0, len(l), n):
yield l[i:i + n]
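# Example: list(get_chunks([0, 1, 2, 3, 4], 2)) yields [[0, 1], [2, 3], [4]].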
def get_video_capture_and_frame_count(path):
  assert os.path.isfile(
    path), "Couldn't find video file: " + path + ". Skipping video."
  cap = None
  if path:
    cap = cv2.VideoCapture(path)
  assert cap is not None, "Couldn't load video capture: " + path + ". Skipping video."
  # read the frame count from the video meta data (the API differs between OpenCV versions)
if hasattr(cv2, 'cv'):
frame_count = int(cap.get(cv2.cv.CAP_PROP_FRAME_COUNT))
else:
frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
return cap, frame_count
def get_next_frame(cap):
ret, frame = cap.read()
if not ret:
return None
return np.asarray(frame)
def compute_dense_optical_flow(prev_image, current_image):
old_shape = current_image.shape
prev_image_gray = cv2.cvtColor(prev_image, cv2.COLOR_BGR2GRAY)
current_image_gray = cv2.cvtColor(current_image, cv2.COLOR_BGR2GRAY)
assert current_image.shape == old_shape
  # encode the flow in HSV space: hue = flow direction, value = flow
  # magnitude, saturation fixed at maximum
  hsv = np.zeros_like(prev_image)
  hsv[..., 1] = 255
  flow = None
flow = cv2.calcOpticalFlowFarneback(prev=prev_image_gray,
next=current_image_gray, flow=flow,
pyr_scale=0.8, levels=15, winsize=5,
iterations=10, poly_n=5, poly_sigma=0,
flags=10)
mag, ang = cv2.cartToPolar(flow[..., 0], flow[..., 1])
hsv[..., 0] = ang * 180 / np.pi / 2
hsv[..., 2] = cv2.normalize(mag, None, 0, 255, cv2.NORM_MINMAX)
return cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR)
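# A minimal usage sketch (file paths are hypothetical): given two consecutive
# BGR frames of equal shape, the function returns a BGR visualization in which
# hue encodes the flow direction and brightness encodes the flow magnitude:
#
#   frame_a = cv2.imread('frame_000.png')
#   frame_b = cv2.imread('frame_001.png')
#   flow_vis = compute_dense_optical_flow(frame_a, frame_b)
#   cv2.imwrite('flow_vis.png', flow_vis)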
def convert_videos_to_tfrecord(source_path, destination_path,
n_videos_in_record=10, n_frames_per_video='all',
file_suffix="*.mp4", dense_optical_flow=True,
width=1280, height=720,
color_depth="uint8", video_filenames=None):
"""Starts the process of converting video files to tfrecord files. If
dense_optical_flow is set to True, the number of video channels in the
tfrecords will automatically 4, i.e. the pipeline assumes 3 (RGB) channels
in the videos. This pipeline does not (yet) support a different number of
channels.
Args:
source_path: directory where video videos are stored
destination_path: directory where tfrecords should be stored
n_videos_in_record: Number of videos stored in one single tfrecord file
n_frames_per_video: integer value of string. Specifies the number of frames extracted from each video. If set to 'all', all frames are extracted from the
videos and stored in the tfrecord. If the number is lower than the number of available frames, the subset of extracted frames will be selected equally
spaced over the entire video playtime.
file_suffix: defines the video file type, e.g. *.mp4
dense_optical_flow: boolean flag that controls if optical flow should be
used and added to tfrecords
width: the width of the videos in pixels
height: the height of the videos in pixels
color_depth: Color depth as string for the images stored in the tfrecord
files. Has to correspond to the source video color depth. Specified as
dtype (e.g. uint8 or uint16)
video_filenames: specify, if the the full paths to the videos can be
directly be provided. In this case, the source will be ignored.
"""
assert isinstance(n_frames_per_video, (int, str))
  # tf.image.encode_jpeg cannot encode the additional optical flow channel,
  # so frames are stored as raw bytes when optical flow is enabled
  jpeg_encode = not dense_optical_flow
if type(n_frames_per_video) is str:
assert n_frames_per_video == "all"
if dense_optical_flow:
n_channels = 4
else:
n_channels = 3
if video_filenames is not None:
filenames = video_filenames
else:
filenames = gfile.Glob(os.path.join(source_path, file_suffix))
if not filenames:
raise RuntimeError('No data files found.')
print('Total videos found: ' + str(len(filenames)))
filenames_split = list(get_chunks(filenames, n_videos_in_record))
  for i, batch in enumerate(filenames_split):
data = convert_video_to_numpy(filenames=batch, width=width, height=height,
n_frames_per_video=n_frames_per_video,
n_channels=n_channels,
dense_optical_flow=dense_optical_flow)
if n_videos_in_record > len(filenames):
total_batch_number = 1
else:
total_batch_number = int(math.ceil(len(filenames) / n_videos_in_record))
print('Batch ' + str(i + 1) + '/' + str(total_batch_number) + " completed")
assert data.size != 0, 'something went wrong during video to numpy conversion'
save_numpy_to_tfrecords(data, destination_path, 'batch_',
n_videos_in_record, i + 1, total_batch_number,
color_depth=color_depth, jpeg_encode=jpeg_encode)
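# A minimal usage sketch mirroring the flag defaults above (the directories
# are placeholders and must exist; five equally spaced frames plus an optical
# flow channel are stored per video):
#
#   convert_videos_to_tfrecord(source_path='./example/input',
#                              destination_path='./example/output',
#                              n_videos_in_record=10, n_frames_per_video=5,
#                              file_suffix='*.mp4', dense_optical_flow=True)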
def save_numpy_to_tfrecords(data, destination_path, name, fragmentSize,
current_batch_number, total_batch_number,
color_depth, jpeg_encode=True):
"""Converts an entire dataset into x tfrecords where x=videos/fragmentSize.
Args:
data: ndarray(uint8) of shape (v,i,h,w,c) with v=number of videos,
i=number of images, c=number of image channels, h=image height, w=image
width
name: filename; data samples type (train|valid|test)
fragmentSize: specifies how many videos are stored in one tfrecords file
current_batch_number: indicates the current batch index (function call within loop)
total_batch_number: indicates the total number of batches
jpeg_encode: specify how to encode the video frames
"""
num_videos = data.shape[0]
num_images = data.shape[1]
num_channels = data.shape[4]
height = data.shape[2]
width = data.shape[3]
writer = None
feature = {}
  for video_count in range(num_videos):
if video_count % fragmentSize == 0:
if writer is not None:
writer.close()
filename = os.path.join(destination_path,
name + str(current_batch_number) + '_of_' + str(
total_batch_number) + '.tfrecords')
print('Writing', filename)
if tf.__version__.split('.')[0] == '2':
writer = tf.io.TFRecordWriter(filename)
else:
writer = tf.python_io.TFRecordWriter(filename)
for image_count in range(num_images):
path = 'blob' + '/' + str(image_count)
image = data[video_count, image_count, :, :, :]
image = image.astype(color_depth)
      if jpeg_encode:
        # note: .numpy() requires eager execution (TF 2.x); JPEG encoding is
        # only used when no optical flow channel is present (see jpeg_encode)
        image_raw = tf.image.encode_jpeg(image).numpy()
      else:
        image_raw = image.tobytes()
feature[path] = _bytes_feature(image_raw)
feature['height'] = _int64_feature(height)
feature['width'] = _int64_feature(width)
feature['depth'] = _int64_feature(num_channels)
example = tf.train.Example(features=tf.train.Features(feature=feature))
writer.write(example.SerializeToString())
if writer is not None:
writer.close()
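# A minimal read-back sketch for records written with jpeg_encode=False,
# assuming TF 2.x; parse_video_example is not part of the original pipeline,
# and n_frames/height/width/n_channels must match the values used at write
# time. Example:
#   ds = tf.data.TFRecordDataset('batch_1_of_1.tfrecords').map(
#     lambda s: parse_video_example(s, 5, 720, 1280, 4))
def parse_video_example(serialized, n_frames, height, width, n_channels):
  features = {'height': tf.io.FixedLenFeature([], tf.int64),
              'width': tf.io.FixedLenFeature([], tf.int64),
              'depth': tf.io.FixedLenFeature([], tf.int64)}
  for j in range(n_frames):
    features['blob/' + str(j)] = tf.io.FixedLenFeature([], tf.string)
  parsed = tf.io.parse_single_example(serialized, features)
  # each frame was serialized with ndarray.tobytes(), so decode raw bytes
  frames = [tf.io.decode_raw(parsed['blob/' + str(j)], tf.uint8)
            for j in range(n_frames)]
  return tf.reshape(tf.stack(frames), (n_frames, height, width, n_channels))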
def repeat_image_retrieval(cap, file_path, video, take_all_frames, steps, frame,
                           prev_frame_none, frames_counter):
  """Restarts the capture with a smaller step size after a failed frame read."""
  stop = False
  if (frame is None and prev_frame_none) or steps <= 0:
    stop = True
    return stop, cap, video, steps, prev_frame_none, frames_counter
if not take_all_frames:
# repeat with smaller step size
steps -= 1
prev_frame_none = True
print("reducing step size due to error for video: ", file_path)
frames_counter = 0
cap.release()
    cap, _ = get_video_capture_and_frame_count(file_path)
# wait for image retrieval to be ready
time.sleep(2)
return stop, cap, video, steps, prev_frame_none, frames_counter
def video_file_to_ndarray(i, file_path, n_frames_per_video, height, width,
n_channels, num_real_image_channel,
dense_optical_flow, number_of_videos):
cap, frame_count = get_video_capture_and_frame_count(file_path)
take_all_frames = False
# if not all frames are to be used, we have to skip some -> set step size accordingly
if n_frames_per_video == 'all':
take_all_frames = True
video = np.zeros((frame_count, height, width, n_channels), dtype=np.uint8)
steps = frame_count
n_frames = frame_count
else:
video = np.zeros((n_frames_per_video, height, width, n_channels),
dtype=np.uint8)
steps = int(math.floor(frame_count / n_frames_per_video))
n_frames = n_frames_per_video
assert not (frame_count < 1 or steps < 1), str(
file_path) + " does not have enough frames. Skipping video."
# variables needed
image = np.zeros((height, width, num_real_image_channel),
dtype=FLAGS.image_color_depth)
frames_counter = 0
prev_frame_none = False
restart = True
image_prev = None
while restart:
for f in range(frame_count):
      if f % steps == 0 or take_all_frames:
frame = get_next_frame(cap)
# unfortunately opencv uses bgr color format as default
if frame is not None:
frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        # special case handling: OpenCV's frame count sometimes differs from
        # the real frame count -> retry with a smaller step size
        if frame is None and frames_counter < n_frames:
          stop, cap, video, steps, prev_frame_none, frames_counter = \
            repeat_image_retrieval(
              cap, file_path, video, take_all_frames, steps, frame,
              prev_frame_none, frames_counter)
if stop:
restart = False
break
else:
video[frames_counter, :, :, :].fill(0)
frames_counter += 1
else:
if frames_counter >= n_frames:
restart = False
break
# iterate over channels
for k in range(num_real_image_channel):
resizedImage = cv2.resize(frame[:, :, k], (width, height))
image[:, :, k] = resizedImage
          if dense_optical_flow:
            # optical flow needs two consecutive images; for the very first
            # frame the appended flow channel is simply left black
            if image_prev is not None:
              frame_flow = compute_dense_optical_flow(image_prev, image)
              frame_flow = cv2.cvtColor(frame_flow, cv2.COLOR_BGR2GRAY)
            else:
              frame_flow = np.zeros((height, width), dtype=np.uint8)
            image_prev = image.copy()
# assemble the video from the single images
if dense_optical_flow:
image_with_flow = image.copy()
image_with_flow = np.concatenate(
(image_with_flow, np.expand_dims(frame_flow, axis=2)), axis=2)
video[frames_counter, :, :, :] = image_with_flow
else:
video[frames_counter, :, :, :] = image
frames_counter += 1
      else:
        # this frame lies between two sampled frames -> read it and discard it
        get_next_frame(cap)
print(str(i + 1) + " of " + str(
number_of_videos) + " videos within batch processed: ", file_path)
v = video.copy()
cap.release()
return v
def convert_video_to_numpy(filenames, n_frames_per_video, width, height,
n_channels, dense_optical_flow=False):
"""Generates an ndarray from multiple video files given by filenames.
Implementation chooses frame step size automatically for a equal separation distribution of the video images.
Args:
filenames: a list containing the full paths to the video files
width: width of the video(s)
height: height of the video(s)
n_frames_per_video: integer value of string. Specifies the number of frames extracted from each video. If set to 'all', all frames are extracted from the
videos and stored in the tfrecord. If the number is lower than the number of available frames, the subset of extracted frames will be selected equally
spaced over the entire video playtime.
n_channels: number of channels to be used for the tfrecords
type: processing type for video data
Returns:
if no optical flow is used: ndarray(uint8) of shape (v,i,h,w,c) with
v=number of videos, i=number of images, (h,w)=height and width of image,
c=channel, if optical flow is used: ndarray(uint8) of (v,i,h,w,
c+1)
"""
number_of_videos = len(filenames)
  if dense_optical_flow:
    # the dense optical flow is stored in an additional (4th) channel
    n_channels = 4
    num_real_image_channel = 3
  else:
    # without optical flow, all channels are real image channels
    num_real_image_channel = n_channels
data = []
for i, file in enumerate(filenames):
try:
v = video_file_to_ndarray(i=i, file_path=file,
n_frames_per_video=n_frames_per_video,
height=height, width=width,
n_channels=n_channels,
num_real_image_channel=num_real_image_channel,
dense_optical_flow=dense_optical_flow,
number_of_videos=number_of_videos)
data.append(v)
except Exception as e:
print(e)
return np.array(data)
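# A minimal entry point sketch (an assumption, not part of the code shown
# above; the flag definitions at the top suggest the script is run this way):
def main(_):
  convert_videos_to_tfrecord(FLAGS.source, FLAGS.destination,
                             n_videos_in_record=FLAGS.n_videos_in_record,
                             n_frames_per_video=FLAGS.n_frames_per_video,
                             file_suffix=FLAGS.file_suffix,
                             dense_optical_flow=FLAGS.optical_flow,
                             width=FLAGS.width_video,
                             height=FLAGS.height_video,
                             color_depth=FLAGS.image_color_depth,
                             video_filenames=FLAGS.video_filenames)


if __name__ == '__main__':
  app.run(main)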