tweaks to enumerate and conformers jobs

InformaticsMatters · Mar 28, 2024 · a3abc85 · a3abc85
1 parent 77dd1b3
commit a3abc85
Show file tree

Hide file tree

Showing 4 changed files with 74 additions and 54 deletions.
diff --git a/data-manager/docs/mordred/descriptor-generator.md b/data-manager/docs/mordred/descriptor-generator.md
@@ -49,17 +49,18 @@ It is perfectly OK to use `.smi` when generating 3D descriptors, but the 3D coor
 written as SMILES.
 
 ## Notes
-(1). If the molecules have multiple fragments many of the descriptors will not be generated correctly, so only use the 
+1. If the molecules have multiple fragments many of the descriptors will not be generated correctly, so only use the 
 'none' option if you are certain that all molecules only contain a single fragment.
-(2). 3D descriptors can only be generated from molecules with 3D coordinates in a SD-file 
-(3). These options apply only to delimited text files and are ignored for SD files.
-(4). If the input does not have a header line the field names are not know, so ones are created using the pattern *field1*, *field2* ...
-(5). There is no way of knowing all the fields that are present in a SD file without reading the data, and not all
+2. 3D descriptors can only be generated from molecules with 3D coordinates in a SD-file .
+3. These options apply only to delimited text files and are ignored for SD files.
+4. If the input does not have a header line the field names are not know, so ones are created using the pattern *field1*, *field2* ...
+5. There is no way of knowing all the fields that are present in a SD file without reading the data, and not all
 fields have to be present in every record. This means that new fields can be encountered as you process the file.
 Normally you can get the full list of fields by reading a small number of records, but this is not guaranteed. 
-By default 100 records are read which is usually enough, but in rare cases you might need to read more.
+By default, 100 records are read which is usually enough, but in rare cases you might need to read more.
 A new field could in theory appear in the last record!
 
 ## Related topics
 
-- [generate-low-energy-conformers job](../rdkit/generate-low-energy-conformers.md) can be used to generate conformers.
+- [enumerate-candidates job](../im-virtual-screening/enumerate-candidates.md) and
+[generate-low-energy-conformers job](../rdkit/generate-low-energy-conformers.md) can be used to generate conformers.
diff --git a/enumerate.py b/enumerate.py
@@ -92,6 +92,7 @@ def add_molecule(mydict, mol, code):
 def execute(input, output, delimiter=' ',
             id_column=None, mol_column=0,
             read_header=False, read_records=100,
+            fragment='hac',
             enumerate_chirals=False,
             enumerate_charges=False, enumerate_tautomers=False,
             combinatorial=False,
@@ -138,6 +139,10 @@ def execute(input, output, delimiter=' ',
             if interval and count % interval == 0:
                 DmLog.emit_event("Processed {} records".format(count))
 
+            if fragment != 'none':
+                mol = rdkit_utils.fragment(mol, fragment)
+                smi = Chem.MolToSmiles(mol)
+
             # if we have HAC filters apply them
             hac = mol.GetNumHeavyAtoms()
             if min_hac is not None and min_hac > hac:
@@ -219,7 +224,7 @@ def execute(input, output, delimiter=' ',
 def main():
 
     # Example:
-    #   ./enumerate.py -i bar.smi --enumerate-tautomers --enumerate-chirals --enumerate-charges
+    #   ./enumerate.py -i data/1000.smi -o bar.sdf --enumerate-tautomers --enumerate-chirals --enumerate-charges
 
     ### command line args definitions #########################################
 
@@ -234,7 +239,8 @@ def main():
                         help="Read a header line with the field names when reading .smi or .txt")
     parser.add_argument('--read-records', default=100, type=int,
                         help="Read this many records to determine the fields that are present")
-
+    parser.add_argument('-f', '--fragment', choices=['hac', 'mw', 'none'], default='hac',
+                        help='Strategy for picking largest fragment (mw or hac or none')
     parser.add_argument('--enumerate-charges', help='Enumerate charge forms', action='store_true')
     parser.add_argument('--enumerate-chirals', help='Enumerate undefined chiral centers', action='store_true')
     parser.add_argument('--enumerate-tautomers', help='Enumerate undefined chiral centers', action='store_true')
@@ -262,6 +268,7 @@ def main():
     mol_column = args.mol_column
     read_header = args.read_header
     read_records = args.read_records
+    fragment = args.fragment
     enumerate_charges = args.enumerate_charges
     enumerate_chirals = args.enumerate_chirals
     enumerate_tautomers = args.enumerate_tautomers
@@ -291,6 +298,7 @@ def main():
         execute(input, output,
                 delimiter=delimiter, id_column=id_column, mol_column=mol_column,
                 read_header=read_header, read_records=read_records,
+                fragment=fragment,
                 enumerate_chirals=enumerate_chirals,
                 enumerate_charges=enumerate_charges,
                 enumerate_tautomers=enumerate_tautomers,

diff --git a/le_conformers.py b/le_conformers.py
@@ -21,7 +21,7 @@
 See also le_conformers_for_mol.py that provides a simpler way to generate conformers for a single molecule using the
 same methodology that this module uses.
 """
-import os, argparse, traceback, time, gzip
+import os, sys, argparse, traceback, time, gzip
 
 import utils, rdkit_utils
 from dm_job_utilities.dm_log import DmLog
@@ -119,7 +119,6 @@ def execute(input, output, minimize_cycles=500,
     utils.expand_path(output)
 
     input_count = 0
-    enumerated_count = 0
     conformer_count = 0
     error_count = 0
 
@@ -129,50 +128,60 @@ def execute(input, output, minimize_cycles=500,
     # read the input records and write the output
     with gzip.open(output, 'wt') if output.endswith('.gz') else open(output, 'wt') as out:
         while True:
-            t = reader.read()
-            # break if no more data to read
-            if not t:
-                break
-            mol, smi, id, props = t
+            try:
+                t = reader.read()
+                # break if no more data to read
+                if not t:
+                    break
+                mol, smi, id, props = t
 
-            input_count += 1
+                input_count += 1
 
-            if interval and input_count % interval == 0:
-                DmLog.emit_event("Processed {} records".format(input_count))
+                if interval and input_count % interval == 0:
+                    DmLog.emit_event("Processed {} records".format(input_count))
 
-            if not mol:
-                error_count += 1
-                DmLog.emit_event("Failed to read molecule for record", input_count)
-                continue
+                if not mol:
+                    error_count += 1
+                    DmLog.emit_event("Failed to read molecule for record", input_count)
+                    continue
 
-            conf_count_for_mol = 0
+                conf_count_for_mol = 0
 
-            try:
                 molh = Chem.AddHs(mol)
 
                 mol_with_confs = gen_conformers(molh, rms_threshold, minimize_cycles, remove_hydrogens,
                                                 num_conformers=num_conformers)
+
                 for idx in range(mol_with_confs.GetNumConformers()):
+                    try:
+                        if id is not None:
+                            mol_with_confs.SetProp('ID', id)
+                            mol_with_confs.SetProp('_Name', id + '_' + str(conf_count_for_mol + 1))
+                        else:
+                            mol_with_confs.SetProp('_Name', str(input_count) + '_' + str(conf_count_for_mol + 1))
+
+                        mol_with_confs.SetIntProp("CONF_NUM", conf_count_for_mol + 1)
+                        mol_with_confs.SetDoubleProp('Energy',
+                                                     mol_with_confs.GetConformer(idx).GetDoubleProp('Energy'))
+                        mol_with_confs.SetDoubleProp('Energy_Delta',
+                                                     mol_with_confs.GetConformer(idx).GetDoubleProp(
+                                                         'Energy_Delta'))
+
+                        sdf_block = Chem.SDWriter.GetText(mol_with_confs, confId=idx)
+                        chg_block = rdkit_utils.updateChargeFlagInAtomBlock(sdf_block)
+                        out.write(chg_block)
+
+                        conformer_count += 1
+                        conf_count_for_mol += 1
+
+                    except KeyboardInterrupt:
+                        utils.log('Interrupted')
+                        sys.exit(0)
+                    except:
+                        utils.log('Error processing input {}, conformer {}'.format(input_count, conformer_count + 1))
+                        error_count += 1
+                        traceback.print_exc()
 
-                    if id is not None:
-                        mol_with_confs.SetProp('ID', id)
-                        mol_with_confs.SetProp('_Name', id + '_' + str(conf_count_for_mol + 1))
-                    else:
-                        mol_with_confs.SetProp('_Name', str(input_count) + '_' + str(conf_count_for_mol + 1))
-
-                    mol_with_confs.SetIntProp("CONF_NUM", conf_count_for_mol + 1)
-                    mol_with_confs.SetDoubleProp('Energy',
-                                                 mol_with_confs.GetConformer(idx).GetDoubleProp('Energy'))
-                    mol_with_confs.SetDoubleProp('Energy_Delta',
-                                                 mol_with_confs.GetConformer(idx).GetDoubleProp(
-                                                     'Energy_Delta'))
-
-                    sdf_block = Chem.SDWriter.GetText(mol_with_confs, confId=idx)
-                    chg_block = rdkit_utils.updateChargeFlagInAtomBlock(sdf_block)
-                    out.write(chg_block)
-
-                    conformer_count += 1
-                    conf_count_for_mol += 1
                 mol_with_confs.ClearProp('ID')
                 mol_with_confs.ClearProp('CONF_NUM')
                 mol_with_confs.ClearProp('Energy')
@@ -190,15 +199,15 @@ def execute(input, output, minimize_cycles=500,
     reader.close()
     DmLog.emit_cost(conformer_count)
 
-    return input_count, enumerated_count, conformer_count, error_count
+    return input_count, conformer_count, error_count
 
 
 ### start main execution #########################################
 
 def main():
 
     # Example:
-    #   python3 le_conformers.py -i bar.smi
+    #   python le_conformers.py -i bar.smi -o baz.sdf
 
     ### command line args definitions #########################################
 
@@ -229,7 +238,7 @@ def main():
     delimiter = utils.read_delimiter(args.delimiter)
 
     start = time.time()
-    input_count, enumerated_count, conformer_count, error_count = \
+    input_count, conformer_count, error_count = \
         execute(args.input, args.output,
                 delimiter=delimiter,
                 id_column=args.id_column,
@@ -244,7 +253,7 @@ def main():
                 )
     end = time.time()
 
-    DmLog.emit_event('Inputs:', input_count, 'Enumerated:', enumerated_count,
+    DmLog.emit_event('Inputs:', input_count,
                      'Conformers:', conformer_count, 'Errors:', error_count, 'Time (s):', end - start)
 
 

diff --git a/rdkit_utils.py b/rdkit_utils.py
@@ -150,6 +150,8 @@ def read(self):
         try:
             props = []
             mol = next(self.reader)
+            if mol is None:
+                return None, None, None, None
             smi = Chem.MolToSmiles(mol)
             if self.id_col:
                 if mol.HasProp(self.id_col):
@@ -166,8 +168,7 @@ def read(self):
                     props.append(mol.GetProp(name))
                 else:
                     props.append(None)
-            t = (mol, smi, id, props)
-            return t
+            return mol, smi, id, props
 
         except StopIteration:
             return None
@@ -370,7 +371,8 @@ def updateChargeFlagInAtomBlock(mb):
     This function is based on work by Jose Manuel Gally that can be found here:
     See https://sourceforge.net/p/rdkit/mailman/message/36425493/
     """
-    f="{:>10s}"*3+"{:>2}{:>4s}"+"{:>3s}"*11
+    # f="{:>10s}"*3 + "{:>2}{:>4s}" + "{:>3s}"*11
+    f = "{:>10s}"*3 + " {:>2}" + "{:>3s}"*12
     chgs = []    # list of charges
     lines = mb.split("\n")
     #if mb[0] == '' or mb[0] == "\n":
@@ -383,7 +385,7 @@ def updateChargeFlagInAtomBlock(mb):
         if l[0:6] == "M  CHG":
             records = l.split()[3:]    # M  CHG X is not needed for parsing, the info we want comes afterwards
             # record each charge into a list
-            for i in range(0,len(records),2):
+            for i in range(0,len(records), 2):
                 idx = records[i]
                 chg = records[i+1]
                 chgs.append((int(idx), int(chg)))    # sort tuples by first element?
@@ -430,9 +432,9 @@ def updateChargeFlagInAtomBlock(mb):
                     charge = '1'
                 else:
                     utils.log("ERROR! " + str(lines[0]) + "unknown charge flag: " + str(chg[1]))
-                    break
+                    charge = '0'
                 # update modatom block line
-                lines[i] = f.format(x,y,z,symb,massDiff,charge,sp,hc,scb,v,hd,nu1,nu2,aamn,irf,ecf)
+                lines[i] = f.format(x, y, z, symb, massDiff, charge, sp, hc, scb, v, hd, nu1, nu2, aamn, irf, ecf)
             i+=1
     upmb = "\n".join(lines)
     return(upmb)