-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path_Retrieve_D2P2_Consensus_Data.py
85 lines (63 loc) · 3.69 KB
/
_Retrieve_D2P2_Consensus_Data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import os
import pandas as pd
import json
from urllib.request import urlopen
### Purpose of this script: query D2P2 and retrieve the disorder consensus data
# Either store the data as CSV directly, or manipulate it first and then store it as CSV
# Store the data with these two columns: ["Position", "Score"]
# Store under the name: "[Query]_[GeneName]_D2P2_Consensus.csv", inside the "_[OrganismName]/" subfolder of Core/ or NonCore/
def UniProtD2P2Data(query, inputPath):
    """Download D2P2 disorder-consensus scores for the Core and NonCore proteins.

    Reads ``{inputPath}Core/{Query}_Core_Proteins.csv`` and
    ``{inputPath}NonCore/{Query}_NonCore_Proteins.csv`` (columns ``Entry``,
    ``Gene`` and ``Organism``), queries the D2P2 web API once per ``Entry``
    and writes one ``Position``/``Score`` CSV per protein to
    ``{inputPath}<Core|NonCore>/_{Organism}/{Query}_{Gene}_D2P2_Consensus.csv``.

    Args:
        query: Name of the study; ``query.capitalize()`` is used as the prefix
            of both the input and the output file names.
        inputPath: Base directory. It is concatenated directly with
            ``"Core/"`` / ``"NonCore/"``, so it must end with a path separator.

    Returns:
        None. Results are written to disk and progress is printed to stdout.

    Raises:
        FileNotFoundError: If one of the two input CSV files does not exist.
        urllib.error.URLError: If a D2P2 request fails.
    """
    # Shared settings (captured by the nested helpers below)
    capitalizeQuery = query.capitalize()
    col_list = ['Entry', 'Gene', 'Organism']

    # Paths/Files/Output
    corePath = "{path}Core/".format(path=inputPath)
    nonCorePath = "{path}NonCore/".format(path=inputPath)
    inputCoreFile = "{path}Core/{studying}_Core_Proteins.csv".format(path=inputPath, studying=capitalizeQuery)
    inputNonCoreFile = "{path}NonCore/{studying}_NonCore_Proteins.csv".format(path=inputPath, studying=capitalizeQuery)

    # Functions
    def OpenCSV(FileInput):
        """Return the Entry, Gene and Organism columns of FileInput as three lists."""
        df = pd.read_csv(FileInput, usecols=col_list)
        return df['Entry'].tolist(), df['Gene'].tolist(), df['Organism'].tolist()

    def WriteToCSV(posLst, d2p2Lst, geneName, orgnName, nonCoreOrCore):
        """Write the (position, score) pairs of one protein to its organism folder."""
        outputPathCore = "{pathCore}_{orgnName}/".format(pathCore=corePath, orgnName=orgnName)
        outputPathNonCore = "{pathNonCore}_{orgnName}/".format(pathNonCore=nonCorePath, orgnName=orgnName)

        # Both organism folders are created on every call, whichever one is
        # written to; kept as-is because later pipeline steps may rely on both
        # existing. exist_ok avoids the check-then-create race.
        os.makedirs(outputPathCore, exist_ok=True)
        os.makedirs(outputPathNonCore, exist_ok=True)

        if nonCoreOrCore == "Core":
            outputPath = outputPathCore
        elif nonCoreOrCore == "NonCore":
            outputPath = outputPathNonCore
        else:
            # Any other label writes no file.
            return

        df = pd.DataFrame(list(zip(posLst, d2p2Lst)), columns=["Position", "Score"])
        df.to_csv("{outputPath}{studying}_{geneName}_D2P2_Consensus.csv".format(studying=capitalizeQuery, outputPath=outputPath, geneName=geneName),
                  header=["Position", "Score"], index=False)

    def D2P2RawData(entryList, geneList, orgnList, coreOrNonCore):
        """Query D2P2 for every entry and store its consensus scores as CSV."""
        print('\n')
        for seqID, geneName, orgnName in zip(entryList, geneList, orgnList):
            # flush so the progress line is visible while the request is running
            print("Querying D2P2", coreOrNonCore, ": ", geneName, "for", orgnName, "... ", end="", flush=True)
            seqKey = "{seqid}".format(seqid=seqID)
            requestURL = 'http://d2p2.pro/api/seqid/["{seqid}"]'.format(seqid=seqKey)

            # Close the HTTP response deterministically instead of leaking the socket.
            with urlopen(requestURL) as reply:
                response = json.loads(reply.read())

            try:
                consensusList = list(response[seqKey][0][2]["disorder"]["consensus"])
            except (KeyError, IndexError, TypeError):
                # The response does not have the expected structure for this
                # entry; report it and carry on with the remaining proteins
                # rather than aborting the whole batch.
                print("No D2P2 consensus data found, skipped.")
                continue

            # Positions are 1-based and run parallel to the consensus scores.
            positionList = list(range(1, len(consensusList) + 1))
            WriteToCSV(positionList, consensusList, geneName, orgnName, coreOrNonCore)
            print("Done.")

    ###----Execution---###
    # Opening the CSV files
    coreEntryLst, coreGeneLst, coreOrgnLst = OpenCSV(inputCoreFile)
    nonCoreEntryLst, nonCoreGeneLst, nonCoreOrgnLst = OpenCSV(inputNonCoreFile)

    # Retrieving D2P2 Data
    D2P2RawData(coreEntryLst, coreGeneLst, coreOrgnLst, "Core")
    D2P2RawData(nonCoreEntryLst, nonCoreGeneLst, nonCoreOrgnLst, "NonCore")