-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathcreate_feature.py
75 lines (63 loc) · 2.48 KB
/
create_feature.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import os
import numpy as np
import Bio
from Bio import SeqIO
import pickle as pkl
import argparse
# Command-line interface.  --mode selects which test directories are turned
# into feature files in addition to the two training directories.
parser = argparse.ArgumentParser(description='manual to this script')
parser.add_argument(
    '--mode',
    type=str,
    default='virus',
    # Reject unknown modes at parse time instead of after the (slow) feature
    # extraction of the training data has already run.
    choices=['virus', 'prokaryote'],
    help="'virus': featurise single_contig/; "
         "'prokaryote': featurise new_prokaryote/ and single_contig/ "
         "(default: %(default)s)",
)
inputs = parser.parse_args()
def return_4mer(file_in_fn):
    """Build min-max-normalised 4-mer count vectors for the files in a directory.

    Every entry listed in ``file_in_fn`` is handed to ``SeqIO.parse`` as
    FASTA.  The overlapping 4-mers of all records in one file are counted
    into a single 256-column row.  Sequences are upper-cased first, and any
    window that is not made up purely of A/C/G/T is skipped.

    Args:
        file_in_fn: Path of the directory holding the FASTA files.  A
            trailing path separator is accepted but not required.

    Returns:
        A ``(norm_feature, file2idx)`` tuple.  ``norm_feature`` is a
        ``(num_files, 256)`` float array whose rows are rescaled with
        ``(count - min) / (max - min)``; a row whose counts are all equal
        (for example a file with no valid 4-mer) is left as zeros rather
        than becoming NaN.  ``file2idx`` maps each file name, with its last
        extension removed, to its row index in ``norm_feature``.
    """
    # Enumerate all 4-mers.  Each round appends one base to every existing
    # k-mer, with the new base in the outer loop, so the FIRST base varies
    # fastest (AAAA, CAAA, GAAA, TAAA, ACAA, ...).  The column order is part
    # of the output format and must not change.
    nucleotides = ["A", "C", "G", "T"]
    kmers = list(nucleotides)
    for _ in range(3):
        kmers = [prefix + base for base in nucleotides for prefix in kmers]
    mer2col = {mer: col for col, mer in enumerate(kmers)}

    # One feature row per directory entry, in os.listdir() order.
    file_list = os.listdir(file_in_fn)
    num_file = len(file_list)
    file2idx = {}

    feature = np.zeros((num_file, 256))
    for idx, fname in enumerate(file_list):
        file2idx[fname.rsplit('.', 1)[0]] = idx
        counts = feature[idx]  # view: updates land directly in `feature`
        for record in SeqIO.parse(os.path.join(file_in_fn, fname), 'fasta'):
            seq = str(record.seq).upper()
            for pos in range(len(seq) - 3):
                # Windows containing ambiguous bases (N, etc.) have no
                # column and are deliberately ignored.
                col = mer2col.get(seq[pos:pos + 4])
                if col is not None:
                    counts[col] += 1

    # Per-row min-max normalisation.
    norm_feature = np.zeros((num_file, 256))
    for row, counts in enumerate(feature):
        low = counts.min()
        span = counts.max() - low
        # A constant row would divide by zero; keep it as zeros instead.
        if span > 0:
            norm_feature[row] = (counts - low) / span

    return norm_feature, file2idx
def _save_features(fasta_dir, name):
    """Featurise ``fasta_dir`` with return_4mer and pickle both results.

    Writes the name-to-row mapping to ``node_feature/<name>.dict`` and the
    feature matrix to ``node_feature/<name>.F``, closing each file once it
    has been written.  Returns ``(feature, name2id)`` as produced by
    return_4mer.
    """
    feature, name2id = return_4mer(fasta_dir)
    with open('node_feature/' + name + '.dict', 'wb') as handle:
        pkl.dump(name2id, handle)
    with open('node_feature/' + name + '.F', 'wb') as handle:
        pkl.dump(feature, handle)
    return feature, name2id


# Training sets are always (re)generated, whatever the mode.
virus, virus2id = _save_features('train_phage/', 'virus')
prokaryote, prokaryote2id = _save_features('prokaryote/', 'prokaryote')

# Test sets depend on --mode; 'prokaryote' mode also refreshes the test
# virus features.
if inputs.mode == 'virus':
    test_virus, test_virus2id = _save_features('single_contig/', 'test_virus')
elif inputs.mode == 'prokaryote':
    test_prokaryote, test_prokaryote2id = _save_features(
        'new_prokaryote/', 'test_prokaryote')
    test_virus, test_virus2id = _save_features('single_contig/', 'test_virus')
else:
    print('wrong parameters')
    # Exit with a failure status; the site-provided exit() is not always
    # available and would report success.
    raise SystemExit(1)