-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathextract_external_data.py
78 lines (64 loc) · 2.21 KB
/
extract_external_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# Module metadata: author attribution.
__author__ = 'Sarah'
class ExtractExternalData:
    """Load reference records from one of the supported ontology data files.

    Two data types are supported (see ``supported_data``):

    * ``'psimod'`` -- an OBO-style text file; every ``[Term]`` stanza that
      carries a ChEBI cross-reference becomes
      ``{'id': ..., 'name': ..., 'chebi': ...}``.
    * ``'PRO'`` -- a comma-separated file; every row with at least two
      columns becomes ``{'id': ..., 'pro': ...}``.

    Parsed records accumulate in ``self.data`` and are returned by
    ``get_data()``.
    """

    def __init__(self, data_type):
        """Remember the requested data type and start with an empty result list.

        :param data_type: ``'psimod'`` or ``'PRO'``. Any other value makes
            ``populate_data()`` a no-op.
        """
        self.data_type = data_type
        self.data = []
        self.supported_data = [{'datatype': 'psimod', 'datafile': './ontology_data/psimod_obo.txt'},
                               {'datatype': 'PRO', 'datafile': './ontology_data/pro_ref.csv'}]

    def populate_data(self):
        """Read the data file for ``self.data_type`` and append its records to ``self.data``.

        Does nothing when the data type is not listed in ``supported_data``.
        Errors raised by ``open`` (e.g. a missing file) propagate to the caller.
        """
        filename = self.get_data_file()
        if not filename:
            return
        # The context manager closes the handle even if reading fails.
        with open(filename, 'r') as handle:
            lines = handle.readlines()
        if self.data_type == 'psimod':
            self.get_psimod_terms(lines)
        elif self.data_type == 'PRO':
            self.get_pro_data(lines)

    def get_data(self):
        """Return the list of records collected so far."""
        return self.data

    def get_data_file(self):
        """Return the data file path registered for ``self.data_type``, or ``None``."""
        for entry in self.supported_data:
            if entry['datatype'] == self.data_type:
                return entry['datafile']
        return None

    def get_psimod_terms(self, lines):
        """Append one record per ``[Term]`` stanza that has a ChEBI reference.

        :param lines: the lines of the PSI-MOD file, in order.

        Nothing is collected unless ``self.data_type`` is ``'psimod'``.
        """
        if self.data_type != 'psimod':
            return
        for index, line in enumerate(lines):
            if not line.startswith('[Term]'):
                continue
            term = self.get_psimod_term(lines, index)
            if term:
                self.data.append(term)

    def get_psimod_term(self, lines, i):
        """Parse the stanza whose ``[Term]`` header is at ``lines[i]``.

        The three lines after the header are read by position: the first as
        the id line (text after a 4-character prefix), the second as the name
        line (text after a 6-character prefix), and the third is searched for
        a ``ChEBI:`` cross-reference.

        :param lines: the lines of the PSI-MOD file, in order.
        :param i: index of the ``[Term]`` header line.
        :return: ``{'id': ..., 'name': ..., 'chebi': ...}``, or ``None`` when
            the stanza is cut short by the end of the input or has no
            non-empty ChEBI reference.
        """
        if i + 3 >= len(lines):
            # Stanza is truncated by the end of the input; nothing to parse.
            return None

        # Strip only the line terminator so the final character of a line
        # without a trailing newline is not lost.
        term_id = lines[i + 1].rstrip('\r\n')[4:]
        name = lines[i + 2].rstrip('\r\n')[6:]

        xref_line = lines[i + 3].rstrip('\r\n')
        start = xref_line.find('ChEBI:')
        if start == -1:
            return None

        rest = xref_line[start + 6:]
        # The reference ends at the next ',' or at the closing ']' when it is
        # the last entry in the list, whichever comes first.
        end = len(rest)
        for delimiter in (',', ']'):
            position = rest.find(delimiter)
            if position != -1 and position < end:
                end = position
        chebi = rest[:end].strip()

        if chebi == '':
            return None
        return {'id': term_id, 'name': name, 'chebi': chebi}

    def get_pro_data(self, lines):
        """Append an ``{'id', 'pro'}`` record for each comma-separated row.

        :param lines: the lines of the PRO reference file, in order.

        Only the first two columns are used; rows with fewer than two columns
        (including blank lines) are skipped.
        """
        for line in lines:
            columns = line.rstrip('\r\n').split(',')
            if len(columns) < 2:
                continue
            self.data.append({'id': columns[0], 'pro': columns[1]})