-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfile_split.py
154 lines (134 loc) · 5 KB
/
file_split.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import os, sys
from settings import Settings
import datetime
import re
def tail(file, taillines=500, return_str=True, avg_line_length=None):
    """Return the last ``taillines`` lines of ``file`` without reading it all.

    The file is opened in text mode with ``errors='ignore'`` and the platform
    default encoding. A window at the end of the file is read and grown
    backwards until it holds more than ``taillines`` lines, or until it
    reaches the start of the file.

    Args:
        file: Path of the file to read.
        taillines: Maximum number of trailing lines to return.
        return_str: If true, return the lines joined into one string;
            otherwise return them as a list (line endings kept).
        avg_line_length: Estimated line length used to size the window.
            When falsy it is estimated from the last 3000 positions of the
            file (or the whole file if it is shorter).

    Returns:
        A string or a list of lines, depending on ``return_str``. Empty for
        an empty file or a non-positive ``taillines``.
    """
    def _result(lines):
        return "".join(lines) if return_str else lines

    with open(file, errors='ignore') as f:
        f.seek(0, 2)
        end_pointer = f.tell()
        if end_pointer == 0 or taillines <= 0:
            return _result([])

        if not avg_line_length:
            # Clamp the sample to the file size: seeking to a negative
            # position raises ValueError on files shorter than 3000.
            sample_size = min(3000, end_pointer)
            f.seek(end_pointer - sample_size)
            sample_lines = len(f.readlines())
            avg_line_length = int(sample_size / max(sample_lines, 1)) + 10

        # Guard against a zero/negative step, which would never terminate.
        step = max(1, int(taillines * avg_line_length))
        offset = step
        while offset < end_pointer:
            f.seek(end_pointer - offset)
            lines = f.readlines()
            # Require one extra line: the first line of the window may be
            # truncated because the seek can land in the middle of a line.
            if len(lines) > taillines:
                return _result(lines[-taillines:])
            offset += step

        # The window would cover the whole file: just read it from the top.
        f.seek(0, 0)
        return _result(f.readlines()[-taillines:])
def split_by_count(src_file, count):
    """Split ``src_file`` into parts of at most ``count`` lines each.

    Parts are written to a ``count`` sub-directory next to the source file
    and are named ``<stem>_<n><ext>`` with ``n`` starting at 0. A part file
    is only created when there is at least one line to put in it, so an
    empty source produces no parts and a line count that is an exact
    multiple of ``count`` does not leave an empty trailing part.

    Args:
        src_file: Path of the UTF-8 text file to split.
        count: Maximum number of lines per part; must be positive.

    Raises:
        ValueError: If ``count`` is not positive (the split could never
            make progress).
    """
    if count <= 0:
        raise ValueError('count must be a positive number of lines, got %r' % (count,))

    filedir, name = os.path.split(src_file)
    name, ext = os.path.splitext(name)
    filedir = os.path.join(filedir, 'count')
    if not os.path.exists(filedir):
        os.mkdir(filedir)

    partno = 0
    with open(src_file, 'r', encoding='utf-8') as stream:
        # Read one line ahead so we know whether another part is needed
        # before creating its file.
        line = stream.readline()
        while line:
            partfilename = os.path.join(filedir, name + '_' + str(partno) + ext)
            print('write start %s' % partfilename)
            with open(partfilename, 'w', encoding='utf-8') as part_stream:
                written = 0
                while line and written < count:
                    part_stream.write(line)
                    written += 1
                    line = stream.readline()
            partno += 1
def split_by_date(src_file):
    """Split a log file into one file per day, keyed on ``"@timestamp"``.

    The source is read once, line by line. The date of a line is the text
    between ``"@timestamp":"`` and the following ``T``. Lines that carry no
    (valid) timestamp are treated as belonging to the same day as the
    previous line, so records with an inconsistent format stay with their
    neighbours. Output goes to a ``date`` sub-directory next to the source
    file, as ``<source file name>.<YYYY-MM-DD>``.

    Every input line is written to exactly one output file; in particular
    the first line of the file and the first line of each new day are kept.
    Dates are taken from the lines with a regular expression rather than by
    evaluating file content as Python code.

    Args:
        src_file: Path of the UTF-8 log file to split.

    Raises:
        ValueError: If the first line of a non-empty file carries no valid
            ``"@timestamp"`` date, so there is no day to assign it to.
    """
    filedir, name = os.path.split(src_file)
    filedir = os.path.join(filedir, 'date')
    if not os.path.exists(filedir):
        os.mkdir(filedir)

    # Compiled once; the per-line regex is much cheaper than parsing the
    # whole record just to read one field.
    re_timestamp = re.compile(r'(?<="@timestamp":").*?(?=T)')

    started = set()       # dates whose output file was already created
    current_date = None   # date of the file currently being written
    part_stream = None
    try:
        with open(src_file, 'r', encoding='utf-8') as stream:
            for line in stream:
                match = re_timestamp.search(line)
                log_date = match.group() if match else current_date
                if log_date != current_date:
                    # Only validate on a change of date to keep the common
                    # path cheap. A malformed date is handled like a line
                    # without a timestamp and must never become part of a
                    # file name.
                    try:
                        datetime.datetime.strptime(log_date, "%Y-%m-%d")
                    except ValueError:
                        log_date = current_date
                if log_date is None:
                    raise ValueError(
                        'first line of %s has no valid "@timestamp" date' % src_file)

                if log_date != current_date:
                    if part_stream is not None:
                        part_stream.close()
                    partfilename = os.path.join(filedir, name + '.' + log_date)
                    if log_date in started:
                        # Out-of-order record for a day seen earlier:
                        # append instead of truncating that day's file.
                        part_stream = open(partfilename, 'a', encoding='utf-8')
                    else:
                        print('write start %s' % partfilename)
                        part_stream = open(partfilename, 'w', encoding='utf-8')
                        started.add(log_date)
                    current_date = log_date

                part_stream.write(line)
    finally:
        if part_stream is not None:
            part_stream.close()
def return_cut_list(lst):
    """Cut ``lst`` into runs of neighbours that share the same suffix.

    The suffix of an item is the text after its last ``.``. A new run starts
    wherever two adjacent items have different suffixes; items are never
    reordered. An empty input yields a single empty run.
    """
    # Positions at which a new run begins (the right-hand item of each
    # mismatching adjacent pair).
    boundaries = [
        pos + 1
        for pos, (left, right) in enumerate(zip(lst, lst[1:]))
        if left.split('.')[-1] != right.split('.')[-1]
    ]
    edges = [0] + boundaries + [len(lst)]
    return [lst[begin:end] for begin, end in zip(edges, edges[1:])]
def get_date(str):
    """Return the part of ``str`` after its last ``.`` (all of it if none)."""
    _, _, suffix = str.rpartition('.')
    return suffix
def merge_by_date(src_dir):
    """Merge the files in ``src_dir`` that share a date suffix.

    The date of a file is whatever ``get_date`` returns for its name (the
    text after the last ``.``). All regular entries of ``src_dir`` with the
    same date are concatenated into ``<src_dir>/date/gateway.log.<date>``.
    Sub-directories are ignored. Within one date, files are merged in
    lexicographic order of their names so the result is reproducible.

    Does nothing beyond creating the ``date`` directory when ``src_dir``
    contains no files.

    Args:
        src_dir: Directory holding the UTF-8 files to merge.
    """
    merge_filedir = os.path.join(src_dir, 'date')
    if not os.path.exists(merge_filedir):
        os.mkdir(merge_filedir)

    filenames = [
        entry for entry in os.listdir(src_dir)
        if not os.path.isdir(os.path.join(src_dir, entry))
    ]
    if not filenames:
        # return_cut_list([]) yields one empty group, which has no first
        # element to take the date from.
        return

    # os.listdir order is arbitrary; the file name is used as a tie-breaker
    # so that files of the same date are always merged in the same order.
    filenames.sort(key=lambda entry: (get_date(entry), entry))

    for merge_list in return_cut_list(filenames):
        merge_filename = os.path.join(
            merge_filedir, "gateway.log." + str(get_date(merge_list[0])))
        print('write start %s' % merge_filename)
        with open(merge_filename, 'w', encoding='utf-8') as merge_stream:
            for filename in merge_list:
                source_path = os.path.join(src_dir, filename)
                with open(source_path, 'r', encoding='utf-8') as stream:
                    # Copy in bounded chunks instead of loading each input
                    # file into memory at once.
                    for chunk in iter(lambda: stream.read(1024 * 1024), ''):
                        merge_stream.write(chunk)
if __name__ == '__main__':
    # Hard-coded inputs for ad-hoc runs of this script.
    src_dir = './gateway_login/'
    src_file = './access_login/access.log'
    # Alternative entry point, currently disabled:
    # split_by_count(src_file, 500000)
    merge_by_date(src_dir)