Skip to content

Commit

Permalink
tweaks to enumerate and conformers jobs
Browse files Browse the repository at this point in the history
  • Loading branch information
tdudgeon committed Mar 28, 2024
1 parent 77dd1b3 commit a3abc85
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 54 deletions.
15 changes: 8 additions & 7 deletions data-manager/docs/mordred/descriptor-generator.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,17 +49,18 @@ It is perfectly OK to use `.smi` when generating 3D descriptors, but the 3D coor
written as SMILES.

## Notes
(1). If the molecules have multiple fragments many of the descriptors will not be generated correctly, so only use the
1. If the molecules have multiple fragments many of the descriptors will not be generated correctly, so only use the
'none' option if you are certain that all molecules only contain a single fragment.
(2). 3D descriptors can only be generated from molecules with 3D coordinates in a SD-file
(3). These options apply only to delimited text files and are ignored for SD files.
(4). If the input does not have a header line the field names are not know, so ones are created using the pattern *field1*, *field2* ...
(5). There is no way of knowing all the fields that are present in a SD file without reading the data, and not all
2. 3D descriptors can only be generated from molecules with 3D coordinates in a SD-file .
3. These options apply only to delimited text files and are ignored for SD files.
4. If the input does not have a header line the field names are not know, so ones are created using the pattern *field1*, *field2* ...
5. There is no way of knowing all the fields that are present in a SD file without reading the data, and not all
fields have to be present in every record. This means that new fields can be encountered as you process the file.
Normally you can get the full list of fields by reading a small number of records, but this is not guaranteed.
By default 100 records are read which is usually enough, but in rare cases you might need to read more.
By default, 100 records are read which is usually enough, but in rare cases you might need to read more.
A new field could in theory appear in the last record!

## Related topics

- [generate-low-energy-conformers job](../rdkit/generate-low-energy-conformers.md) can be used to generate conformers.
- [enumerate-candidates job](../im-virtual-screening/enumerate-candidates.md) and
[generate-low-energy-conformers job](../rdkit/generate-low-energy-conformers.md) can be used to generate conformers.
12 changes: 10 additions & 2 deletions enumerate.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ def add_molecule(mydict, mol, code):
def execute(input, output, delimiter=' ',
id_column=None, mol_column=0,
read_header=False, read_records=100,
fragment='hac',
enumerate_chirals=False,
enumerate_charges=False, enumerate_tautomers=False,
combinatorial=False,
Expand Down Expand Up @@ -138,6 +139,10 @@ def execute(input, output, delimiter=' ',
if interval and count % interval == 0:
DmLog.emit_event("Processed {} records".format(count))

if fragment != 'none':
mol = rdkit_utils.fragment(mol, fragment)
smi = Chem.MolToSmiles(mol)

# if we have HAC filters apply them
hac = mol.GetNumHeavyAtoms()
if min_hac is not None and min_hac > hac:
Expand Down Expand Up @@ -219,7 +224,7 @@ def execute(input, output, delimiter=' ',
def main():

# Example:
# ./enumerate.py -i bar.smi --enumerate-tautomers --enumerate-chirals --enumerate-charges
# ./enumerate.py -i data/1000.smi -o bar.sdf --enumerate-tautomers --enumerate-chirals --enumerate-charges

### command line args definitions #########################################

Expand All @@ -234,7 +239,8 @@ def main():
help="Read a header line with the field names when reading .smi or .txt")
parser.add_argument('--read-records', default=100, type=int,
help="Read this many records to determine the fields that are present")

parser.add_argument('-f', '--fragment', choices=['hac', 'mw', 'none'], default='hac',
help='Strategy for picking largest fragment (mw or hac or none')
parser.add_argument('--enumerate-charges', help='Enumerate charge forms', action='store_true')
parser.add_argument('--enumerate-chirals', help='Enumerate undefined chiral centers', action='store_true')
parser.add_argument('--enumerate-tautomers', help='Enumerate undefined chiral centers', action='store_true')
Expand Down Expand Up @@ -262,6 +268,7 @@ def main():
mol_column = args.mol_column
read_header = args.read_header
read_records = args.read_records
fragment = args.fragment
enumerate_charges = args.enumerate_charges
enumerate_chirals = args.enumerate_chirals
enumerate_tautomers = args.enumerate_tautomers
Expand Down Expand Up @@ -291,6 +298,7 @@ def main():
execute(input, output,
delimiter=delimiter, id_column=id_column, mol_column=mol_column,
read_header=read_header, read_records=read_records,
fragment=fragment,
enumerate_chirals=enumerate_chirals,
enumerate_charges=enumerate_charges,
enumerate_tautomers=enumerate_tautomers,
Expand Down
87 changes: 48 additions & 39 deletions le_conformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
See also le_conformers_for_mol.py that provides a simpler way to generate conformers for a single molecule using the
same methodology that this module uses.
"""
import os, argparse, traceback, time, gzip
import os, sys, argparse, traceback, time, gzip

import utils, rdkit_utils
from dm_job_utilities.dm_log import DmLog
Expand Down Expand Up @@ -119,7 +119,6 @@ def execute(input, output, minimize_cycles=500,
utils.expand_path(output)

input_count = 0
enumerated_count = 0
conformer_count = 0
error_count = 0

Expand All @@ -129,50 +128,60 @@ def execute(input, output, minimize_cycles=500,
# read the input records and write the output
with gzip.open(output, 'wt') if output.endswith('.gz') else open(output, 'wt') as out:
while True:
t = reader.read()
# break if no more data to read
if not t:
break
mol, smi, id, props = t
try:
t = reader.read()
# break if no more data to read
if not t:
break
mol, smi, id, props = t

input_count += 1
input_count += 1

if interval and input_count % interval == 0:
DmLog.emit_event("Processed {} records".format(input_count))
if interval and input_count % interval == 0:
DmLog.emit_event("Processed {} records".format(input_count))

if not mol:
error_count += 1
DmLog.emit_event("Failed to read molecule for record", input_count)
continue
if not mol:
error_count += 1
DmLog.emit_event("Failed to read molecule for record", input_count)
continue

conf_count_for_mol = 0
conf_count_for_mol = 0

try:
molh = Chem.AddHs(mol)

mol_with_confs = gen_conformers(molh, rms_threshold, minimize_cycles, remove_hydrogens,
num_conformers=num_conformers)

for idx in range(mol_with_confs.GetNumConformers()):
try:
if id is not None:
mol_with_confs.SetProp('ID', id)
mol_with_confs.SetProp('_Name', id + '_' + str(conf_count_for_mol + 1))
else:
mol_with_confs.SetProp('_Name', str(input_count) + '_' + str(conf_count_for_mol + 1))

mol_with_confs.SetIntProp("CONF_NUM", conf_count_for_mol + 1)
mol_with_confs.SetDoubleProp('Energy',
mol_with_confs.GetConformer(idx).GetDoubleProp('Energy'))
mol_with_confs.SetDoubleProp('Energy_Delta',
mol_with_confs.GetConformer(idx).GetDoubleProp(
'Energy_Delta'))

sdf_block = Chem.SDWriter.GetText(mol_with_confs, confId=idx)
chg_block = rdkit_utils.updateChargeFlagInAtomBlock(sdf_block)
out.write(chg_block)

conformer_count += 1
conf_count_for_mol += 1

except KeyboardInterrupt:
utils.log('Interrupted')
sys.exit(0)
except:
utils.log('Error processing input {}, conformer {}'.format(input_count, conformer_count + 1))
error_count += 1
traceback.print_exc()

if id is not None:
mol_with_confs.SetProp('ID', id)
mol_with_confs.SetProp('_Name', id + '_' + str(conf_count_for_mol + 1))
else:
mol_with_confs.SetProp('_Name', str(input_count) + '_' + str(conf_count_for_mol + 1))

mol_with_confs.SetIntProp("CONF_NUM", conf_count_for_mol + 1)
mol_with_confs.SetDoubleProp('Energy',
mol_with_confs.GetConformer(idx).GetDoubleProp('Energy'))
mol_with_confs.SetDoubleProp('Energy_Delta',
mol_with_confs.GetConformer(idx).GetDoubleProp(
'Energy_Delta'))

sdf_block = Chem.SDWriter.GetText(mol_with_confs, confId=idx)
chg_block = rdkit_utils.updateChargeFlagInAtomBlock(sdf_block)
out.write(chg_block)

conformer_count += 1
conf_count_for_mol += 1
mol_with_confs.ClearProp('ID')
mol_with_confs.ClearProp('CONF_NUM')
mol_with_confs.ClearProp('Energy')
Expand All @@ -190,15 +199,15 @@ def execute(input, output, minimize_cycles=500,
reader.close()
DmLog.emit_cost(conformer_count)

return input_count, enumerated_count, conformer_count, error_count
return input_count, conformer_count, error_count


### start main execution #########################################

def main():

# Example:
# python3 le_conformers.py -i bar.smi
# python le_conformers.py -i bar.smi -o baz.sdf

### command line args definitions #########################################

Expand Down Expand Up @@ -229,7 +238,7 @@ def main():
delimiter = utils.read_delimiter(args.delimiter)

start = time.time()
input_count, enumerated_count, conformer_count, error_count = \
input_count, conformer_count, error_count = \
execute(args.input, args.output,
delimiter=delimiter,
id_column=args.id_column,
Expand All @@ -244,7 +253,7 @@ def main():
)
end = time.time()

DmLog.emit_event('Inputs:', input_count, 'Enumerated:', enumerated_count,
DmLog.emit_event('Inputs:', input_count,
'Conformers:', conformer_count, 'Errors:', error_count, 'Time (s):', end - start)


Expand Down
14 changes: 8 additions & 6 deletions rdkit_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,8 @@ def read(self):
try:
props = []
mol = next(self.reader)
if mol is None:
return None, None, None, None
smi = Chem.MolToSmiles(mol)
if self.id_col:
if mol.HasProp(self.id_col):
Expand All @@ -166,8 +168,7 @@ def read(self):
props.append(mol.GetProp(name))
else:
props.append(None)
t = (mol, smi, id, props)
return t
return mol, smi, id, props

except StopIteration:
return None
Expand Down Expand Up @@ -370,7 +371,8 @@ def updateChargeFlagInAtomBlock(mb):
This function is based on work by Jose Manuel Gally that can be found here:
See https://sourceforge.net/p/rdkit/mailman/message/36425493/
"""
f="{:>10s}"*3+"{:>2}{:>4s}"+"{:>3s}"*11
# f="{:>10s}"*3 + "{:>2}{:>4s}" + "{:>3s}"*11
f = "{:>10s}"*3 + " {:>2}" + "{:>3s}"*12
chgs = [] # list of charges
lines = mb.split("\n")
#if mb[0] == '' or mb[0] == "\n":
Expand All @@ -383,7 +385,7 @@ def updateChargeFlagInAtomBlock(mb):
if l[0:6] == "M CHG":
records = l.split()[3:] # M CHG X is not needed for parsing, the info we want comes afterwards
# record each charge into a list
for i in range(0,len(records),2):
for i in range(0,len(records), 2):
idx = records[i]
chg = records[i+1]
chgs.append((int(idx), int(chg))) # sort tuples by first element?
Expand Down Expand Up @@ -430,9 +432,9 @@ def updateChargeFlagInAtomBlock(mb):
charge = '1'
else:
utils.log("ERROR! " + str(lines[0]) + "unknown charge flag: " + str(chg[1]))
break
charge = '0'
# update modatom block line
lines[i] = f.format(x,y,z,symb,massDiff,charge,sp,hc,scb,v,hd,nu1,nu2,aamn,irf,ecf)
lines[i] = f.format(x, y, z, symb, massDiff, charge, sp, hc, scb, v, hd, nu1, nu2, aamn, irf, ecf)
i+=1
upmb = "\n".join(lines)
return(upmb)
Expand Down

0 comments on commit a3abc85

Please sign in to comment.