-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathuniprot.py
211 lines (160 loc) · 7 KB
/
uniprot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
#!/usr/bin/python
'''
This script pulls protein specific information from UNIPROT's API. It parses the
protein data and extracts certain features. For usage see README
'''
from collections import defaultdict
# import requests
from Bio import ExPASy, SwissProt
import pandas as pd
import StringIO
server = 'http://www.uniprot.org/uniprot'
def do_request(server, ID='', **kwargs):
# type: (object, object, object) -> object
params = ''
req = requests.get('%s/%s%s' % (server, ID, params), params=kwargs)
if not req.ok:
req.raise_for_status()
return req
#Method for extracting features for each protein including Gene Ontology
def extract_features(protein):
done_features = set()
print(len(protein.features))
for feature in protein.features:
if feature[0] in done_features:
continue
else:
done_features.add(feature[0])
print(feature)
print(len(protein.cross_references))
per_source = defaultdict(list)
for xref in protein.cross_references:
source = xref[0]
per_source[source].append(xref[1:])
print(per_source.keys())
done_GOs = set()
print(len(per_source['GO']))
for annot in per_source['GO']:
if annot[1][0] in done_GOs:
continue
else:
done_GOs.add(annot[1][0])
print(annot)
#p53
req = do_request(server, query='gene:p53 AND reviewed:yes AND organism:Human',
format='tab',
columns='entry name,length,organism,organism-id,database(PDB),database(HGNC)'
)
uniprot_list = pd.read_table(StringIO.StringIO(req.text))
uniprot_list.rename(columns={'Organism ID': 'ID'}, inplace=True)
print(uniprot_list)
# print(uniprot_list.iloc[:,[0]])
# print(uniprot_list.iloc[:,[1]])
# print(uniprot_list.iloc[:,[3]])
#additional features
p53_human = uniprot_list[uniprot_list.ID == 9606]['Entry name'].tolist()[0]
handle = ExPASy.get_sprot_raw(p53_human)
sp_rec = SwissProt.read(handle)
# print(sp_rec.entry_name, sp_rec.sequence_length, sp_rec.gene_name)
# print(sp_rec.description)
# print(sp_rec.organism, sp_rec.seqinfo)
# print(sp_rec.sequence)
# extract_features(sp_rec)
#myc
req = do_request(server, query='gene:myc AND reviewed:yes AND organism:Human',
format='tab',
columns='entry name,length,organism,organism-id,database(PDB),database(HGNC)'
)
uniprot_list1 = pd.read_table(StringIO.StringIO(req.text))
uniprot_list1.rename(columns={'Organism ID': 'ID'}, inplace=True)
print(uniprot_list1)
# print(uniprot_list1.iloc[:,[0]])
# print(uniprot_list1.iloc[:,[1]])
# print(uniprot_list1.iloc[:,[3]])
#additional myc analysis
myc = uniprot_list1[uniprot_list1.ID == 9606]['Entry name'].tolist()[0]
handle1 = ExPASy.get_sprot_raw(myc)
sp_rec1 = SwissProt.read(handle1)
# print(sp_rec1.entry_name, sp_rec.sequence_length, sp_rec.gene_name)
# print(sp_rec1.description)
# print(sp_rec1.organism, sp_rec.seqinfo)
# print(sp_rec1.sequence)
extract_features(sp_rec1)
#errb2
req = do_request(server, query='gene:her2 AND reviewed:yes AND organism:Human',
format='tab',
columns='entry name,length,organism,organism-id,database(PDB),database(HGNC)'
)
uniprot_list2 = pd.read_table(StringIO.StringIO(req.text))
uniprot_list2.rename(columns={'Organism ID': 'ID'}, inplace=True)
print(uniprot_list2)
# print(uniprot_list2.iloc[:,[0]])
# print(uniprot_list2.iloc[:,[1]])
# print(uniprot_list2.iloc[:,[3]])
#additional errb2 analysis
errb2 = uniprot_list2[uniprot_list2.ID == 9606]['Entry name'].tolist()[0]
handle2 = ExPASy.get_sprot_raw(errb2)
sp_rec2 = SwissProt.read(handle2)
# print(sp_rec2.entry_name, sp_rec2.sequence_length, sp_rec2.gene_name)
# print(sp_rec2.description)
# print(sp_rec2.organism, sp_rec2.seqinfo)
# print(sp_rec2.sequence)
# extract_features(sp_rec2)
#Epidermal Growth Factor
req = do_request(server, query='gene:egfr AND reviewed:yes AND organism:Human',
format='tab',
columns='entry name,length,organism,organism-id,database(PDB),database(HGNC)'
)
uniprot_list3 = pd.read_table(StringIO.StringIO(req.text))
uniprot_list3.rename(columns={'Organism ID': 'ID'}, inplace=True)
print(uniprot_list3)
# print(uniprot_list3.iloc[:,[0]])
# print(uniprot_list3.iloc[:,[1]])
# print(uniprot_list3.iloc[:,[3]])
#additional egfr analysis
egfr = uniprot_list3[uniprot_list3.ID == 9606]['Entry name'].tolist()[0]
handle3 = ExPASy.get_sprot_raw(egfr)
sp_rec3 = SwissProt.read(handle3)
# print(sp_rec3.entry_name, sp_rec3.sequence_length, sp_rec3.gene_name)
# print(sp_rec3.description)
# print(sp_rec3.organism, sp_rec3.seqinfo)
# print(sp_rec3.sequence)
# extract_features(sp_rec3)
#Pten
req = do_request(server, query='gene:pten AND reviewed:yes AND organism:Human',
format='tab',
columns='entry name,length,organism,organism-id,database(PDB),database(HGNC)'
)
uniprot_list4 = pd.read_table(StringIO.StringIO(req.text))
uniprot_list4.rename(columns={'Organism ID': 'ID'}, inplace=True)
print(uniprot_list4)
# print(uniprot_list4.iloc[:,[0]])
# print(uniprot_list4.iloc[:,[1]])
# print(uniprot_list4.iloc[:,[3]])
#additional pten analysis
pten = uniprot_list4[uniprot_list4.ID == 9606]['Entry name'].tolist()[0]
handle4 = ExPASy.get_sprot_raw(pten)
sp_rec4 = SwissProt.read(handle4)
# print(sp_rec4.entry_name, sp_rec4.sequence_length, sp_rec4.gene_name)
# print(sp_rec4.description)
# print(sp_rec4.organism, sp_rec4.seqinfo)
# print(sp_rec4.sequence)
extract_features(sp_rec4)
# should be done using a better way, extracting data from uniprot API and hardcoding it defeats the purpose
# pass this to proteomics datatbase
data = [
(1,'P53', 393, sp_rec.description, "GO:0000785, C:chromatin, IBA:GO_Central GO:0005524, F:ATP binding, \
IDA:UniProtKB GO:0006915, \
P:apoptotic process, TAS:Reactome"),
(2,'MYC', 439, sp_rec2.description, "GO:0005829, C:cytosol, TAS:Reactome \
GO:0003677, F:DNA binding, ISS:UniProtKB) \
GO:1904837, P:beta-catenin-TCF complex assembly, TAS:Reactome"),
(3,'ERRB2',1255, sp_rec2.description, "GO:0016324, C:apical plasma membrane, IEA:Ensembl \
GO:0005524, F:ATP binding', IEA:UniProtKB-KW \
GO:0008283, P:cell proliferation, TAS:ProtInc"),
(4,'EGFR', 1210, sp_rec3.description, "GO:0009897, C:external side of plasma membrane, IDA:MGI \
GO:0030246, F:carbohydrate binding', 'IEA:UniProtKB-KW"),
(5,'PTEN', 403, sp_rec4.description, "GO:0016324, C:apical plasma membrane, IMP:UniProtKB \
GO:0010997', F:anaphase-promoting complex binding, IPI:BHF-UCL \
GO:0030534, P:adult behavior, IEA:Ensembl")
]