-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdataset_prepare.py
100 lines (71 loc) · 4.08 KB
/
dataset_prepare.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import csv
import pandas as pd
import numpy as np
import glob
from Bio.PDB import PDBParser, PDBIO, Select # for needed model and chain
from pdbtools import pdb_fetch # for raw PDB download
import subprocess # for run bash script
from schrodinger import structure # for PRR
from schrodinger.protein import reliability # for PRR
# Upload dataset from CSV to pd.DataFrame
PDB_dataset = pd.read_csv("/path/to/datadrame.csv")
'''We should have such columns
'PDB' (PDB_id; str),
'WILD_TYPE' (naive one-letter residue; str),
'CHAIN' (chain, where mutation occurs; str),
'POSITION' (mutation residue number, could be with letters; str),
'MUTANT' (mutant one-letter residue; str),
'DDG' (experimental ddG; float)
If columns are different - delete unneeded and rename left as above'''
# Drop duplicates and save only mean value of ddG for one mutation (if there are values with different ddG sign for one mutation - delete it)
PDB_dataset = PDB_dataset.drop_duplicates()
PDB_dataset = PDB_dataset.groupby(['PDB', 'CHAIN', 'WILD_TYPE', 'POSITION', 'MUTANT'], as_index=False).filter(lambda x:(x.ddG.all() > 0) || (x.ddG.all() < 0))
PDB_dataset = PDB_dataset.groupby(['PDB', 'CHAIN', 'WILD_TYPE', 'POSITION', 'MUTANT'], as_index=False).aggregate({"ddG":"mean"})
# Create extra column PDB_CHAIN, will need it later
PDB_dataset["PDB_CHAIN"] = PDB_dataset["PDB"] + PDB_dataset["CHAIN"]
# Get unique PDB_id from dataset
PDB_unique = PDB_dataset["PDB"].unique()
# Download raw PDB
for PDB_id in PDB_unique:
PDB_file = pdb_fetch.run(PDB_id[:4])
with open(f"/path/to/folder/{PDB_id[:4]}.pdb",mode="a") as f:
for i in PDB_file:
f.write(i)
# Get unique PDB_CHAIN from dataset
PDB_id_chain_unique = PDB_dataset['PDB_CHAIN'].unique()
# Create class to remove heteroatoms and non-protein atoms
class NonHetSelect(Select):
def accept_residue(self, residue):
return 1 if residue.id[0] == " " else 0
# Save only 1 model and needed chain (or chains if several for one protein)
parser = PDBParser()
io = PDBIO()
for PDB_id_chain in PDB_id_chain_unique:
structure = parser.get_structure(PDB_id_chain[:4], f"/path/to/folder/{PDB_id_chain[:4]}.pdb")
chain = structure[0][PDB_id_chain[4]] # get 1 model and needed chain
io.set_structure(chain)
io.save(f"/path/to/folder/{PDB_id_chain[:5]}_model_1.pdb", NonHetSelect())
# Apply Protein Preparation Wizard (PPW) to all structures (can be done only in bash). Make sure you can run Schrodinger from command line
# As a result we get {PDB_CHAIN}_prep.pdb files
import subprocess
subprocess.run('dataset_prepwiz.sh')
# Check structures after PPW by Protein Reliability Report (PRR) for missing atoms, missung chains and steric clashes
errors = {}
for pdb_file in glob.glob("*_prep.pdb"):
st = structure.StructureReader.read(pdb_file)
check = reliability.run_quick_check(st)
if check.getTextSummary() != []:
errors[pdb_file] = check.getTextSummary()
structures_with_errors = []
for structure, report in errors.items():
problems = ['missing loops','missing atoms','steric clashes']
matching = [report_item for report_item in report if any(problem in report_item for problem in problems)]
if matching != []:
print("There is a problem with this structure:", structure)
structures_with_errors.append(structure[:5])
# Delete all structures from previous step from dataset
for structure in structures_with_errors:
PDB_dataset.drop(PDB_dataset[PDB_dataset['PDB_CHAIN'] == structure].index, inplace=True)
# Save filtered dataset
PDB_dataset.to_csv(f'/path/to/folder/dataset.csv', header=True, index=False, mode='w')
# As a result we get dataset.csv file with needed columns and bunch of {PDB_CHAIN}_prep.pdb files