s3_tar.py
import io
import json
import time
import boto3
import logging
import tarfile
import threading
from .s3_mpu import S3MPU
from .utils import _create_s3_client, _convert_to_bytes, _threads, MIN_S3_SIZE

logger = logging.getLogger(__name__)


class S3Tar:
    def __init__(self, source_bucket, target_key,
                 target_bucket=None,
                 min_file_size=None,
                 cache_size=5,
                 save_metadata=False,
                 remove_keys=False,
                 allow_dups=False,
                 s3_max_retries=4,
                 part_size_multiplier=None,
                 session=boto3.session.Session()):
        self.allow_dups = allow_dups
        self.source_bucket = source_bucket
        self.target_bucket = target_bucket
        if self.target_bucket is None:
            self.target_bucket = self.source_bucket
        self.target_key = target_key
        if min_file_size is not None:
            self.min_file_size = _convert_to_bytes(min_file_size)
        else:
            self.min_file_size = None

        if self.target_key.endswith('.tar'):
            self.content_type = 'application/x-tar'
            self.compression_type = None
        elif self.target_key.endswith('.tar.gz'):
            self.content_type = 'application/gzip'
            self.compression_type = 'gz'
        elif self.target_key.endswith('.tar.bz2'):
            self.content_type = 'application/x-bzip2'
            self.compression_type = 'bz2'
        else:
            raise ValueError("Invalid file extension: {}"
                             .format(self.target_key))

        self.save_metadata = save_metadata
        self.mode = 'w'
        if self.compression_type is not None:
            self.mode += '|' + self.compression_type

        if part_size_multiplier is None:
            self.part_size_multiplier = 10
        elif (not isinstance(part_size_multiplier, int)
                or part_size_multiplier <= 0):
            logger.warning("Part size multiplier must be a positive integer."
                           " Defaulting to 10")
            self.part_size_multiplier = 10
        else:
            self.part_size_multiplier = part_size_multiplier

        self.all_keys = set()  # Keys the user adds
        self.keys_to_delete = set()  # Keys to delete on cleanup
        self.remove_keys = remove_keys
        self.file_cache = []  # io objects that are ready to be combined
        self.cache_size = cache_size
        if self.cache_size is None or self.cache_size <= 0:
            raise ValueError("cache size must be 1 or larger")

        self.s3_max_retries = s3_max_retries
        if self.s3_max_retries is None or self.s3_max_retries <= 0:
            raise ValueError("s3 max retries must be 1 or larger")

        self.s3 = _create_s3_client(
            session,
            pool_size=self.cache_size * 2,
            max_retries=self.s3_max_retries,
        )
    def tar(self):
        """Start the tar'ing process with what has been added
        """
        # Kick off a background job that downloads source files into the cache
        cache_t = threading.Thread(target=self._pre_fetch_files)
        cache_t.daemon = True
        cache_t.start()

        # Keep creating new tar(.gz) files as long as there are files left
        file_number = 0
        while self._is_complete() is False:
            file_number += 1
            self._new_file_upload(file_number)

        self._cleanup()
    def _cleanup(self):
        """Remove source keys from s3
        """
        if self.remove_keys is True:
            logger.info("Removing all keys added to the tar {filename}"
                        .format(filename=self.target_key))
            while len(self.keys_to_delete) > 0:
                # delete_objects accepts at most 1000 keys per request
                delete_these = []
                while len(delete_these) < 1000 and len(self.keys_to_delete) > 0:
                    delete_these.append({'Key': self.keys_to_delete.pop()[1]})

                logger.debug("Removing {} keys from {}"
                             .format(len(delete_these), self.source_bucket))
                resp = self.s3.delete_objects(
                    Bucket=self.source_bucket,
                    Delete={'Objects': delete_these}
                )
                logger.debug("Delete objects response: {}".format(resp))

        # TODO: Clear the whole class
        self.s3 = None  # Clear all current connections
    def _new_file_upload(self, file_number):
        """Start a new multipart upload for the tar file

        Args:
            file_number (int): The number of this file getting created
        """
        result_filepath = self._add_file_number(file_number)
        # Start multipart upload
        mpu = S3MPU(self.s3, self.target_bucket, result_filepath)

        current_file_size = 0
        # If out of files or min size is met, then complete file
        while (self._is_complete() is False
                and (self.min_file_size is None
                     or current_file_size < self.min_file_size)):
            current_part_io = self._get_part_contents()
            current_file_size += current_part_io.tell()
            mpu.upload_part(current_part_io)

        mpu.complete()
    def _get_part_contents(self):
        """Create multipart upload contents

        Pull files from the file cache and append them until the part is
        large enough to be uploaded via s3's multipart upload

        Returns:
            io.BytesIO: BytesIO object to be uploaded
        """
        current_io = io.BytesIO()
        current_size = 0
        while current_size < MIN_S3_SIZE * self.part_size_multiplier:
            source_tar_io = self._get_file_from_cache()
            if source_tar_io is None:
                # Must be the end since there are no more files to add
                break

            source_tar_io.seek(0)
            current_io.write(source_tar_io.read())
            source_tar_io.close()  # Cleanup

            # New current size
            current_size = current_io.tell()

        return current_io
    def _get_file_from_cache(self):
        """Pull content from the file cache to build a part to get uploaded

        Returns:
            io.BytesIO|None: The next cached tar chunk, or None if no
                files are left
        """
        while self._is_complete() is False:
            if self.file_cache != []:
                # Must pop from idx 0;
                # the last item in the file cache has the EOF bytes
                return self.file_cache.pop(0)
            time.sleep(0.1)

        return None
    def _add_file_number(self, file_number):
        """Add a file number to the tar filename if needed

        If it is possible that multiple tar files will be created,
        number them so they do not get overwritten.
        This happens when self.min_file_size is set

        Args:
            file_number (int): The number to give the file

        Returns:
            str: The filename to use in s3
        """
        result_filepath = self.target_key
        if self.min_file_size is not None:
            # Need to number the files since the total count is unknown
            compression_ext = ''
            if self.compression_type is not None:
                compression_ext = '.' + self.compression_type
            result_filepath = '{}-{}.tar{}'.format(
                result_filepath.rsplit('.tar', 1)[0],
                file_number,
                compression_ext,
            )

        return result_filepath
    def _is_complete(self):
        """Are there any files left to be uploaded/tar'd

        Returns:
            bool: If we can complete this tar'ing process or not
        """
        return self.all_keys == set() and self.file_cache == []
    def _pre_fetch_files(self):
        """Started as a background job to keep adding files to the
        file cache to speed things along
        """
        def _fetch(item):
            while len(self.file_cache) >= self.cache_size:
                # Hold here until more files are needed in the cache
                time.sleep(0.1)

            tar_member_name, key = item
            logger.debug("Adding to cache {}".format(key))
            self._add_key_to_cache(tar_member_name, key)

        _threads(self.cache_size, self.all_keys, _fetch)
        self.keys_to_delete = self.all_keys.copy()
        self.all_keys = set()  # clear now that all have been processed
    def _add_key_to_cache(self, tar_member_name, key):
        """Get the source of an s3 key (and its metadata if needed) and add
        it to the file cache

        Args:
            tar_member_name (str): Filename and path of the file inside the tar
            key (str): The key to download from s3
        """
        if self.save_metadata is True:
            metadata_io = self._get_tar_source_metadata(tar_member_name, key)
            if metadata_io is not None:
                logger.debug("Adding metadata file to cache {}".format(key))
                self.file_cache.append(metadata_io)

        self.file_cache.append(self._get_tar_source_data(tar_member_name, key))
    def _get_tar_source_data(self, tar_member_name, key):
        """Download source file and generate a tar from it

        Args:
            tar_member_name (str): Filename and path of the file inside the tar
            key (str): File from s3 to download

        Returns:
            io.BytesIO: BytesIO object of the tar file
        """
        source_key_io = self._download_source_file(key)
        source_mtime = self._get_source_key_mtime(key)
        source_tar_io = self._save_bytes_to_tar(
            tar_member_name,
            source_key_io,
            source_mtime,
            mode=self.mode,
        )
        source_key_io.close()  # Cleanup

        return source_tar_io
    def _get_tar_source_metadata(self, tar_member_name, key):
        """Get metadata from the s3 file

        If a file has metadata then add it to a tar file.
        If not, then return None

        Args:
            tar_member_name (str): Filename and path of the file inside the tar
            key (str): S3 file to get metadata from

        Returns:
            io.BytesIO|None: BytesIO object of the tar file
        """
        source_metadata_io = self._download_source_metadata(key)
        if source_metadata_io is None:
            return None

        source_metadata_tar_io = self._save_bytes_to_tar(
            tar_member_name + '.metadata.json',
            source_metadata_io,
            time.time(),
            mode=self.mode,
        )
        source_metadata_io.close()  # Cleanup

        return source_metadata_tar_io
    def _download_source_file(self, key):
        """Download source file from s3 into a BytesIO object

        Args:
            key (str): S3 file to download

        Returns:
            io.BytesIO: BytesIO object of the contents
        """
        source_key_io = io.BytesIO()
        self.s3.download_fileobj(self.source_bucket, key, source_key_io)
        return source_key_io
    def _download_source_metadata(self, key):
        """Get metadata from an s3 file

        Args:
            key (str): S3 file to pull metadata from

        Returns:
            io.BytesIO|None: BytesIO object of the metadata as json,
                or None if the key has no metadata
        """
        source_metadata_io = io.BytesIO()
        metadata = self.s3.head_object(
            Bucket=self.source_bucket,
            Key=key,
        )['Metadata']
        if metadata == {}:
            return None

        source_metadata_io.write(json.dumps(metadata).encode('utf-8'))
        return source_metadata_io
    def _get_source_key_mtime(self, key):
        """Return the last-modified time of the s3 key as a unix timestamp"""
        return self.s3.head_object(
            Bucket=self.source_bucket,
            Key=key,
        )['LastModified'].timestamp()
    @classmethod
    def _save_bytes_to_tar(cls, name, source_io, source_mtime, mode):
        """Convert raw bytes into a tar

        Args:
            name (str): Filename inside the tar
            source_io (io.BytesIO): The data to be saved into the tar
            source_mtime (float): Last modified timestamp of the source file
            mode (str): The file mode in which to open the tar file

        Returns:
            io.BytesIO: BytesIO object of the tar'd data
        """
        source_tar_io = io.BytesIO()
        tar = tarfile.open(fileobj=source_tar_io, mode=mode)

        info = tarfile.TarInfo(name=name)
        info.size = source_io.tell()
        info.mtime = source_mtime
        source_io.seek(0)
        tar.addfile(tarinfo=info, fileobj=source_io)

        if '|' in mode:
            # When using compression, the data is not
            # written to the fileobj until it is done
            tar.fileobj.close()

        return source_tar_io
    def _raise_if_dup(self, tar_member_name, key, key_list=None):
        if key_list is None:
            key_list = self.all_keys

        if self.allow_dups is False:
            if any((True for x in key_list
                    if tar_member_name == x[0] and key != x[1])):
                raise ValueError(("Filename '{member_name}' for key '{key}'"
                                  " already exists in the tar file."
                                  " Set allow_dups to continue.")
                                 .format(member_name=tar_member_name,
                                         key=key))
    def add_files(self, prefix, folder='', preserve_paths=False):
        """Add s3 files from a directory inside the source bucket

        self.all_keys gets added to directly

        Args:
            prefix (str): Folder path inside the source bucket
            folder (str, optional): Folders to place the file inside
                the tar file. Defaults to ''.
            preserve_paths (bool, optional): Starting from the prefix, use
                the path of the key in the tar file. Defaults to False.
        """
        def resp_to_filelist(resp):
            file_list = []
            for x in resp['Contents']:
                key = x['Key']
                if key == prefix:
                    continue

                # Get the paths after the prefix, and before the file
                if preserve_paths is True:
                    tar_member_name = folder + key.replace(prefix, '')
                else:
                    tar_member_name = folder + key.split('/')[-1]

                self._raise_if_dup(tar_member_name, key, key_list=self.all_keys)
                self._raise_if_dup(tar_member_name, key, key_list=file_list)
                file_list.append((tar_member_name, key))

            return file_list

        # needs to end with '/'
        if folder != '' and not folder.endswith('/'):
            folder += '/'

        logger.info("Gathering files from folder {}".format(prefix))
        resp = self.s3.list_objects_v2(Bucket=self.source_bucket, Prefix=prefix)
        if resp['KeyCount'] == 0:
            logger.warning("No files found in the prefix {}".format(prefix))
            return

        file_list = resp_to_filelist(resp)
        self.all_keys |= set(file_list)
        total_file_count = len(file_list)
        logger.debug("Found {} objects so far...".format(total_file_count))

        while resp['IsTruncated']:
            last_key = file_list[-1][1]
            resp = self.s3.list_objects(
                Bucket=self.source_bucket,
                Prefix=prefix,
                Marker=last_key,
            )
            file_list = resp_to_filelist(resp)
            self.all_keys |= set(file_list)
            total_file_count += len(file_list)
            logger.debug("Found {} objects so far...".format(total_file_count))

        logger.info("Found {} objects under the prefix '{}'"
                    .format(total_file_count, prefix))
    def add_file(self, key, folder=''):
        """Add a single file at a time to be tar'd

        self.all_keys gets added to directly

        Args:
            key (str): The full path to a single s3 file in the source bucket
            folder (str, optional): Folders to place the file inside
                the tar file. Defaults to ''.
        """
        # TODO: make sure the key exists; if not, log an error and do not break
        if folder != '' and not folder.endswith('/'):
            folder += '/'

        tar_member_name = folder + key.split('/')[-1]
        self._raise_if_dup(tar_member_name, key)
        self.all_keys.add((tar_member_name, key))
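

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the library):
# a minimal example of driving S3Tar, assuming the package is importable as
# `s3_tar` and that credentials are configured for boto3. The bucket, prefix,
# and key names below are hypothetical placeholders.
#
#   from s3_tar import S3Tar
#
#   job = S3Tar(
#       'my-source-bucket',        # hypothetical source bucket
#       'backups/images.tar.gz',   # extension picks compression (.tar/.tar.gz/.tar.bz2)
#       save_metadata=True,        # also tar each key's metadata as <name>.metadata.json
#   )
#   job.add_files('images/2020/', preserve_paths=True)  # every key under the prefix
#   job.add_file('images/logo.png', folder='assets')    # a single key
#   job.tar()  # downloads, tars, and multipart-uploads the result
# ---------------------------------------------------------------------------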