-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathjsonlines.py
142 lines (122 loc) · 4.66 KB
/
jsonlines.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
'''
Created on Sep 10, 2017
@author: Clark
'''
import matplotlib.pyplot as plt
import os, sys, time, json
from urllib import parse
from hashlib import blake2b
def check_create_path(path):
    '''
    Make sure the output directory exists, creating missing parents as needed.
    path: directory to create. Nothing happens when it already exists.
    Raises FileExistsError when path exists but is not a directory.
    '''
    # exist_ok=True removes the window between a separate isdir() check and
    # the creation, so a directory created concurrently is not an error.
    os.makedirs(path, exist_ok=True)
def get_information(directory):
    '''
    Get file name, timestamp, and file size for every regular file in a directory.
    directory: path of the directory to list (not searched recursively).
    Returns a list of [file name, time string, size in bytes] entries,
    sorted by file name.
    The time string is time.ctime() of st_ctime, which is the creation time
    on Windows but the last metadata change time on Unix.
    '''
    file_list = []
    with os.scandir(directory) as entries:
        for entry in entries:
            # Sub-directories (and broken links) cannot be read as input
            # files later on, so they are left out of the listing.
            if not entry.is_file():
                continue
            info = entry.stat()
            file_list.append([entry.name, time.ctime(info.st_ctime), info.st_size])  # [file, created time, file size]
    # Directory order is arbitrary; sorting makes the result (and any
    # "first N files" selection made from it) reproducible.
    file_list.sort(key=lambda item: item[0])
    return file_list
def plot_histogram(file_sizes):
    '''
    Draw and show a histogram of file sizes (x axis) against the number of
    files in each of 500 bins (y axis).
    file_sizes: sequence of file sizes; the x axis is labelled as KBytes.
    '''
    hist_options = {
        'bins': 500,
        'histtype': 'stepfilled',
        'color': 'b',
        'label': 'html file size',
    }
    plt.hist(file_sizes, **hist_options)
    # Axis labels and title are independent of each other.
    plt.xlabel("HTML File size: KBytes")
    plt.ylabel("HTML File Numbers")
    plt.title("HTML file size/File number Histogram")
    # The legend picks up the label given to hist() above.
    plt.legend()
    plt.show()
def get_input(input_path, input_file):
    '''
    Read an input file as UTF-8 text and return its whole content.
    input_path: directory that contains the file.
    input_file: name of the file inside input_path.
    Raises OSError if the file cannot be opened and UnicodeDecodeError if
    its content is not valid UTF-8.
    '''
    # os.path.join copes with an empty directory or one ending in a separator.
    # The with block closes the file, so no explicit close() is needed.
    with open(os.path.join(input_path, input_file), 'r', encoding='utf8') as f:
        return f.read()
def set_output(outfile_name, input_path, input_files, number_of_files):
    '''
    Write the input files to outfile_name in JSON Lines format, one JSON
    object per line with these fields:
    url = file name, percent-decoded
    timestamp_crawl = time string carried in the input entry
    raw_content = file content
    doc_id = BLAKE2b hex digest (32-byte digest) of the file name
    input_files: entries of [file name, time string, size in bytes].
    number_of_files: maximum number of entries to write; -1 writes them all.
    Returns the sizes (bytes / 1000) of the files that were written.
    '''
    written_sizes = []
    unlimited = number_of_files == -1
    with open(outfile_name, 'w') as sink:
        for position, entry in enumerate(input_files):
            # Stop as soon as the requested number of files has been written.
            if not unlimited and not number_of_files > position:
                break
            name = entry[0]  # file name is URL
            crawled_at = entry[1]
            size_kb = entry[2] / 1000  # KB
            content = get_input(input_path, name)
            record = {
                'url': parse.unquote(name),
                'timestamp_crawl': crawled_at,
                'raw_content': content,
                'doc_id': blake2b(name.encode('utf-8'), digest_size=32).hexdigest(),
            }
            json.dump(record, sink)
            sink.write('\n')
            written_sizes.append(size_kb)
    return written_sizes
def main(plot_picture, input_path=None, output_path=None, number_of_files = -1):
    '''
    Package the files of input_path into a JSON Lines file and/or plot their
    size distribution.
    plot_picture: when true, show the file size histogram at the end.
    input_path: directory that holds the input files.
    output_path: directory for the generated CDR.jl file; when None no file
    is written and only the sizes of the input files are collected.
    number_of_files: maximum number of files to package; -1 packages all.
    '''
    start = time.time()
    input_files = get_information(input_path)
    if output_path is not None:
        # Generate JSON Lines
        check_create_path(output_path)
        outfile_name = os.path.join(output_path, 'CDR.jl')
        files_size = set_output(outfile_name, input_path, input_files, number_of_files)
        # Only announce the file when one was actually written.
        print('CDR file has been generated.')
    else:
        # Only plot the HTML file size distribute
        files_size = [file_info[2] / 1000 for file_info in input_files]  # KB
    end = time.time()
    print('runtime:', end - start)
    if plot_picture:
        plot_histogram(files_size)
def parse_arguments(argv):
    '''
    Turn the command line into the arguments expected by main().
    argv: full argument list, program name first (as in sys.argv).
    Returns (plot_picture, html_files, output_path, number_of_files), or
    None when the command line is not a valid invocation:
    fewer than 2 or more than 4 arguments, no output path without the
    'd' flag (there would be nothing to do), or a non-integer file count.
    '''
    if not 3 <= len(argv) <= 5:
        return None
    plot_picture = argv[1] == 'd'
    html_files = argv[2]
    output_path = argv[3] if len(argv) >= 4 else None
    if output_path is None and not plot_picture:
        return None
    number_of_files = -1  # -1 means "all files"
    if len(argv) == 5:
        try:
            number_of_files = int(argv[4])
        except ValueError:
            return None
    return plot_picture, html_files, output_path, number_of_files


def print_usage(program_name):
    '''
    Print the command line help text for the script called program_name.
    '''
    print('Usage: python ' + program_name + ' <d> <html files input path> [output_path] [number of files]')
    print(' The program requires Python 3.6 to execute.')
    print(' d : Only draw the html file size distribution.')
    print(' html files input path = input file path of html files')
    print(' output_path = Optional, output file path of JSON Lines format')
    print(' number of files : Optional, number of html files you want to package into JSON Lines. Default value is all files.')


if __name__ == '__main__':
    # Get input and output parameters
    arguments = parse_arguments(sys.argv)
    if arguments is None:
        print_usage(sys.argv[0])
        # sys.exit is always available, unlike the site-provided exit();
        # a non-zero status tells the calling shell that the invocation failed.
        sys.exit(1)
    main(*arguments)