-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTXT_to_CSV_converter.py
94 lines (78 loc) · 3.19 KB
/
TXT_to_CSV_converter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# author: itamar bitton
'''
This script converts txt files that hold one JSON object per line into csv files. It walks the folder configured in
`folder_to_convert` and all of its sub-folders, and converts the files of each folder in parallel worker processes.
'''
import csv
import os
import json
from MAC_Addr_File_Filter import create_output_directory
from csv import DictWriter
from multiprocessing import Pool
from functools import partial
# ---------------------------------- CONSTANTS ----------------------------------
# Directory that receives the csv files produced for the root folder.
output_path = 'D:/Dojo_data_logs/ipfix-09.2018(filtered)(csv files)'
# Root folder whose files (and sub-folders) are converted.
folder_to_convert = 'D:/Dojo_data_logs/ipfix-09.2018(filtered)'
# Size handed to multiprocessing.Pool, i.e. the number of worker processes.
num_of_threads = 20
# -------------------------------------------------------------------------------
def flattenjson(b, delim):
    """Collapse a nested dict into a single-level dict.

    Every value that is itself a dict is flattened recursively and its keys
    are prefixed with the parent key followed by ``delim``. All other values
    (lists included) are copied through unchanged. A nested dict that is
    empty contributes no keys to the result.

    :param b: the (possibly nested) dict to flatten
    :param delim: string placed between a parent key and a child key
    :return: a new flat dict; ``b`` is not modified
    """
    flat = {}
    for key, value in b.items():
        if not isinstance(value, dict):
            flat[key] = value
            continue
        # Nested dict: pull its flattened entries up under "<key><delim><child>".
        for child_key, child_value in flattenjson(value, delim).items():
            flat[key + delim + child_key] = child_value
    return flat
def write_csv(path_to_txt, output_path):
    """Convert a txt file of JSON lines into a csv file inside ``output_path``.

    Each non-blank line of ``path_to_txt`` is parsed as one JSON object and
    flattened with ``flattenjson`` (nested keys joined by ``"__"``). The csv
    file is named after the input file with its extension replaced by
    ``.csv``; the input file itself is left untouched.

    :param path_to_txt: path of the input file holding one JSON object per line
    :param output_path: existing directory in which the csv file is created
    :raises json.JSONDecodeError: if a non-blank line is not valid JSON
    """
    # splitext copes with any extension length, unlike slicing off 4 characters.
    base_name = os.path.splitext(os.path.basename(path_to_txt))[0]
    output_file = os.path.join(output_path, base_name + '.csv')
    with open(path_to_txt, 'r') as txt_file, open(output_file, 'w', newline='') as csv_file:
        # First pass: collect the union of all flattened keys, because not
        # every JSON row carries the same key set. A dict (instead of a set)
        # keeps the columns in first-seen order, so the csv layout is the
        # same on every run.
        columns = {}
        for line in txt_file:
            if not line.strip():
                continue  # tolerate blank / trailing empty lines
            for key in flattenjson(json.loads(line), "__"):
                columns.setdefault(key)

        # restval fills every column a row does not provide with 'NA'.
        dict_writer = DictWriter(csv_file, list(columns), restval='NA')
        dict_writer.writeheader()

        # Second pass: stream the file again rather than holding every line
        # in memory, and write each row under its matching columns.
        txt_file.seek(0)
        for line in txt_file:
            if line.strip():
                dict_writer.writerow(flattenjson(json.loads(line), "__"))
def convert(num_of_threads, path_to_convert, output_path):
    """Convert every file under ``path_to_convert`` to csv, recursing into sub-folders.

    For each sub-folder ``name`` an output folder ``<output_path>/name(csv files)``
    is created through ``create_output_directory`` and the sub-folder is
    converted into it. The files directly inside ``path_to_convert`` are then
    converted in parallel by a pool of worker processes running ``write_csv``.

    Make sure that the ``output_path`` folder exists before calling!

    :param num_of_threads: number of worker processes in the pool
    :param path_to_convert: directory whose files and sub-folders are converted
    :param output_path: existing directory receiving this directory's csv files
    """
    txt_files = []
    for entry in os.listdir(path_to_convert):
        # Test the full path instead of chdir-ing into the directory: the
        # process working directory stays untouched and relative roots keep
        # working across the recursion.
        entry_path = path_to_convert + '/' + entry
        if os.path.isdir(entry_path):
            new_output = output_path + '/' + entry + '(csv files)'
            create_output_directory(new_output)
            convert(num_of_threads, entry_path, new_output)
        else:
            txt_files.append(entry_path)

    if not txt_files:
        return  # nothing to convert here, so do not spawn worker processes

    # The pool is created only after the sub-folders are done, so nested
    # folders never stack one idle pool per ancestor directory.
    pool = Pool(num_of_threads)
    try:
        pool.map(partial(write_csv, output_path=output_path), txt_files)
    finally:
        # Always release the worker processes, even if a conversion failed.
        pool.close()
        pool.join()
def main():
    """Run the conversion using the module-level constants as settings."""
    convert(
        num_of_threads=num_of_threads,
        path_to_convert=folder_to_convert,
        output_path=output_path,
    )


# Only start converting when executed as a script, so that importing this
# module does not kick off a conversion.
if __name__ == '__main__':
    main()