removed some code duplication

made protein selection faster, 48ms -> 0.5ms on GRO testfile
MDAnalysis · Jul 5, 2020 · 00ba0ee · 00ba0ee
1 parent f48d4b4
commit 00ba0ee
Show file tree

Hide file tree

Showing 2 changed files with 23 additions and 38 deletions.
diff --git a/package/CHANGELOG b/package/CHANGELOG
@@ -26,7 +26,7 @@ Fixes
   * TOPParser no longer guesses elements when missing atomic number records
     (Issues #2449, #2651)
   * Testsuite does not any more matplotlib.use('agg') (#2191)
-  * In ChainReader, read_frame does not trigger change of iterating position. 
+  * In ChainReader, read_frame does not trigger change of iterating position.
     (Issue #2723, PR #2815)
 
 Enhancements
@@ -39,6 +39,8 @@ Enhancements
   * Added computation of Mean Squared Displacements (#2438, PR #2619)
   * Improved performances when parsing TPR files (PR #2804)
   * Added converter between Cartesian and Bond-Angle-Torsion coordinates (PR #2668)
+  * Improved performance of select_atoms on strings (e.g. name, type, resname) and
+    'protein' selection (Issue #2751, PR #2755)
 
 Changes
   * Changes development status from Beta to Mature (Issue #2773)

diff --git a/package/MDAnalysis/core/selection.py b/package/MDAnalysis/core/selection.py
@@ -515,7 +515,7 @@ def apply(self, group):
         return group[mask]
 
 
-class StringSelection(Selection):
+class _ProtoStringSelection(Selection):
     """Selections based on text attributes
 
     .. versionchanged:: 1.0.0
@@ -540,11 +540,15 @@ def apply(self, group):
                 matches.append(ix)
 
         # atomname indices for members of this group
-        nmidx = nmattr.nmidx[group.ix]
+        nmidx = nmattr.nmidx[getattr(group, self.level)]
 
         return group[np.in1d(nmidx, matches)].unique
 
 
+class StringSelection(_ProtoStringSelection):
+    level = 'ix'  # operates on atom level attribute, i.e. '.ix'
+
+
 class AtomNameSelection(StringSelection):
     """Select atoms based on 'names' attribute"""
     token = 'name'
@@ -569,21 +573,8 @@ class AtomICodeSelection(StringSelection):
     field = 'icodes'
 
 
-class _ResidueStringSelection(StringSelection):
-    def apply(self, group):
-        # rather than work on group.names, cheat and look at the lookup table
-        nmattr = getattr(group.universe._topology, self.field)
-
-        matches = []  # list of passing indices
-        # iterate through set of known atom names, check which pass
-        for nm, ix in nmattr.namedict.items():
-            if any(fnmatch.fnmatch(nm, val) for val in self.values):
-                matches.append(ix)
-
-        # atomname indices for members of this group
-        nmidx = nmattr.nmidx[group.resindices]
-
-        return group[np.in1d(nmidx, matches)].unique    
+class _ResidueStringSelection(_ProtoStringSelection):
+    level = 'resindices'  # residue level attribute, i.e. '.resindices'
 
 
 class ResidueNameSelection(_ResidueStringSelection):
@@ -598,25 +589,11 @@ class MoleculeTypeSelection(_ResidueStringSelection):
     field = 'moltypes'
 
 
-class SegmentNameSelection(StringSelection):
+class SegmentNameSelection(_ProtoStringSelection):
     """Select atoms based on 'segids' attribute"""
     token = 'segid'
     field = 'segids'
-
-    def apply(self, group):
-        # rather than work on group.names, cheat and look at the lookup table
-        nmattr = group.universe._topology.segids
-
-        matches = []  # list of passing indices
-        # iterate through set of known atom names, check which pass
-        for nm, ix in nmattr.namedict.items():
-            if any(fnmatch.fnmatch(nm, val) for val in self.values):
-                matches.append(ix)
-
-        # atomname indices for members of this group
-        nmidx = nmattr.nmidx[group.segindices]
-
-        return group[np.in1d(nmidx, matches)].unique    
+    level = 'segindices'
 
 
 class AltlocSelection(StringSelection):
@@ -845,7 +822,7 @@ class ProteinSelection(Selection):
     """
     token = 'protein'
 
-    prot_res = np.array([
+    prot_res = {
         # CHARMM top_all27_prot_lipid.rtf
         'ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY', 'HSD',
         'HSE', 'HSP', 'ILE', 'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER', 'THR',
@@ -868,14 +845,20 @@ class ProteinSelection(Selection):
         'CLEU', 'CILE', 'CVAL', 'CASF', 'CASN', 'CGLN', 'CARG', 'CHID', 'CHIE',
         'CHIP', 'CTRP', 'CPHE', 'CTYR', 'CGLU', 'CASP', 'CLYS', 'CPRO', 'CCYS',
         'CCYX', 'CMET', 'CME', 'ASF',
-    ])
+    }
 
     def __init__(self, parser, tokens):
         pass
 
     def apply(self, group):
-        mask = np.in1d(group.resnames, self.prot_res)
-        return group[mask].unique
+        resname_attr = group.universe._topology.resnames
+        # which values in resname attr are in prot_res?
+        matches = [ix for (nm, ix) in resname_attr.namedict.items()
+                   if nm in self.prot_res]
+        # index of each atom's resname
+        nmidx = resname_attr.nmidx[group.resindices]
+        # intersect atom's resname index and matches to prot_res
+        return group[np.in1d(nmidx, matches)].unique
 
 
 class NucleicSelection(Selection):