From f48d4b44184001e87fddde9f6ca3ed5a14b8b520 Mon Sep 17 00:00:00 2001
From: richard <richard@nextmovesoftware.com>
Date: Sun, 21 Jun 2020 20:59:06 +0100
Subject: [PATCH] use dict-lookup string attrs EVERYWHERE

---
 package/MDAnalysis/core/selection.py     |  57 +++++---
 package/MDAnalysis/core/topologyattrs.py | 166 ++++++++++++++++++-----
 2 files changed, 176 insertions(+), 47 deletions(-)

diff --git a/package/MDAnalysis/core/selection.py b/package/MDAnalysis/core/selection.py
index 40b3e5a8a24..c6b2e779816 100644
--- a/package/MDAnalysis/core/selection.py
+++ b/package/MDAnalysis/core/selection.py
@@ -529,22 +529,9 @@ def __init__(self, parser, tokens):
         self.values = vals
 
     @return_empty_on_apply
-    def apply(self, group):
-        mask = np.zeros(len(group), dtype=np.bool)
-        values = getattr(group, self.field)
-        for val in self.values:
-            mask |= [fnmatch.fnmatch(x, val) for x in values]
-        return group[mask].unique
-
-
-class AtomNameSelection(StringSelection):
-    """Select atoms based on 'names' attribute"""
-    token = 'name'
-    field = 'names'
-
     def apply(self, group):
         # rather than work on group.names, cheat and look at the lookup table
-        nmattr = group.universe._topology.names
+        nmattr = getattr(group.universe._topology, self.field)
 
         matches = []  # list of passing indices
         # iterate through set of known atom names, check which pass
@@ -558,6 +545,12 @@ def apply(self, group):
         return group[np.in1d(nmidx, matches)].unique
 
 
+class AtomNameSelection(StringSelection):
+    """Select atoms based on 'names' attribute"""
+    token = 'name'
+    field = 'names'
+
+
 class AtomTypeSelection(StringSelection):
     """Select atoms based on 'types' attribute"""
     token = 'type'
@@ -576,13 +569,30 @@ class AtomICodeSelection(StringSelection):
     field = 'icodes'
 
 
-class ResidueNameSelection(StringSelection):
+class _ResidueStringSelection(StringSelection):
+    """Select atoms by fnmatch patterns on a per-residue string attribute"""
+    @return_empty_on_apply
+    def apply(self, group):
+        # rather than test every residue, match the attribute's lookup table
+        nmattr = getattr(group.universe._topology, self.field)
+
+        # lookup indices of the known strings that match any pattern
+        matches = [ix for nm, ix in nmattr.namedict.items()
+                   if any(fnmatch.fnmatch(nm, val) for val in self.values)]
+
+        # lookup index for each member of the group, via its residue index
+        nmidx = nmattr.nmidx[group.resindices]
+
+        return group[np.in1d(nmidx, matches)].unique
+
+
+class ResidueNameSelection(_ResidueStringSelection):
     """Select atoms based on 'resnames' attribute"""
     token = 'resname'
     field = 'resnames'
 
 
-class MoleculeTypeSelection(StringSelection):
+class MoleculeTypeSelection(_ResidueStringSelection):
     """Select atoms based on 'moltypes' attribute"""
     token = 'moltype'
     field = 'moltypes'
@@ -593,6 +603,21 @@ class SegmentNameSelection(StringSelection):
     token = 'segid'
     field = 'segids'
 
+    @return_empty_on_apply
+    def apply(self, group):
+        """Return the unique members of group whose segid matches a pattern"""
+        # rather than test every segment, match the segids lookup table
+        nmattr = group.universe._topology.segids
+
+        # lookup indices of the known segids that match any pattern
+        matches = [ix for nm, ix in nmattr.namedict.items()
+                   if any(fnmatch.fnmatch(nm, val) for val in self.values)]
+
+        # lookup index for each member of the group, via its segment index
+        nmidx = nmattr.nmidx[group.segindices]
+
+        return group[np.in1d(nmidx, matches)].unique
+
 
 class AltlocSelection(StringSelection):
     """Select atoms based on 'altLoc' attribute"""
diff --git a/package/MDAnalysis/core/topologyattrs.py b/package/MDAnalysis/core/topologyattrs.py
index f36b58fda5f..d6328780765 100644
--- a/package/MDAnalysis/core/topologyattrs.py
+++ b/package/MDAnalysis/core/topologyattrs.py
@@ -473,16 +473,7 @@ def _gen_initial_values(na, nr, ns):
         return np.arange(1, na + 1)
 
 
-# TODO: update docs to property doc
-class Atomnames(AtomAttr):
-    """Name for each atom.
-    """
-    attrname = 'names'
-    singular = 'name'
-    per_object = 'atom'
-    dtype = object
-    transplants = defaultdict(list)
-
+class _AtomStringAttr(AtomAttr):
     def __init__(self, vals, guessed=False):
         self._guessed = guessed
       
@@ -538,6 +529,17 @@ def set_atoms(self, ag, values):
             self.name_lookup = np.concatenate([self.name_lookup, newnames])
         self.values = self.name_lookup[self.nmidx]
 
+
+# TODO: update docs to property doc
+class Atomnames(_AtomStringAttr):
+    """Name for each atom.
+    """
+    attrname = 'names'
+    singular = 'name'
+    per_object = 'atom'
+    dtype = object
+    transplants = defaultdict(list)
+
     def phi_selection(residue, c_name='C', n_name='N', ca_name='CA'):
         """Select AtomGroup corresponding to the phi protein backbone dihedral
         C'-N-CA-C.
@@ -1011,20 +1013,16 @@ def chi1_selections(residues, n_name='N', ca_name='CA', cb_name='CB',
 
 
 # TODO: update docs to property doc
-class Atomtypes(AtomAttr):
+class Atomtypes(_AtomStringAttr):
     """Type for each atom"""
     attrname = 'types'
     singular = 'type'
     per_object = 'atom'
     dtype = object
 
-    @staticmethod
-    def _gen_initial_values(na, nr, ns):
-        return np.array(['' for _ in range(na)], dtype=object)
-
 
 # TODO: update docs to property doc
-class Elements(AtomAttr):
+class Elements(_AtomStringAttr):
     """Element for each atom"""
     attrname = 'elements'
     singular = 'element'
@@ -1048,7 +1046,7 @@ def _gen_initial_values(na, nr, ns):
         return np.zeros(na)
 
 
-class RecordTypes(AtomAttr):
+class RecordTypes(_AtomStringAttr):
     """For PDB-like formats, indicates if ATOM or HETATM
 
     Defaults to 'ATOM'
@@ -1066,7 +1064,7 @@ def _gen_initial_values(na, nr, ns):
         return np.array(['ATOM'] * na, dtype=object)
 
 
-class ChainIDs(AtomAttr):
+class ChainIDs(_AtomStringAttr):
     """ChainID per atom
 
     Note
@@ -1078,10 +1076,6 @@ class ChainIDs(AtomAttr):
     per_object = 'atom'
     dtype = object
 
-    @staticmethod
-    def _gen_initial_values(na, nr, ns):
-        return np.array(['' for _ in range(na)], dtype=object)
-
 
 class Tempfactors(AtomAttr):
     """Tempfactor for atoms"""
@@ -1627,7 +1621,7 @@ def _gen_initial_values(na, nr, ns):
 
 
 # TODO: update docs to property doc
-class AltLocs(AtomAttr):
+class AltLocs(_AtomStringAttr):
     """AltLocs for each atom"""
     attrname = 'altLocs'
     singular = 'altLoc'
@@ -1781,8 +1775,65 @@ def _gen_initial_values(na, nr, ns):
         return np.arange(1, nr + 1)
 
 
+class _ResidueStringAttr(ResidueAttr):
+    """Residue-level string attribute stored as unique strings plus indices"""
+    def __init__(self, vals, guessed=False):
+        self._guessed = guessed
+        self.namedict = dict()  # maps str to nmidx
+        name_lookup = []  # maps idx to str
+        # eg namedict['O'] = 5 & name_lookup[5] = 'O'
+
+        self.nmidx = np.zeros_like(vals, dtype=int)  # lookup for each value
+        # eg vals[5] is 'C', so nmidx[5] = 7, where name_lookup[7] = 'C'
+
+        for i, val in enumerate(vals):
+            try:
+                self.nmidx[i] = self.namedict[val]
+            except KeyError:
+                nextidx = len(self.namedict)
+                self.namedict[val] = nextidx
+                name_lookup.append(val)
+                self.nmidx[i] = nextidx
+
+        self.name_lookup = np.array(name_lookup, dtype=object)
+        self.values = self.name_lookup[self.nmidx]
+
+    @staticmethod
+    def _gen_initial_values(na, nr, ns):
+        """Return an object array holding an empty string for each of nr"""
+        return np.array(['' for _ in range(nr)], dtype=object)
+
+    @_check_length
+    def set_residues(self, rg, values):
+        """Set values for rg from a single str or one str per member of rg"""
+        newnames = []
+        # two possibilities, either single value given, or one per Residue
+        if isinstance(values, str):
+            try:
+                newidx = self.namedict[values]
+            except KeyError:
+                newidx = len(self.namedict)
+                self.namedict[values] = newidx
+                newnames.append(values)
+        else:
+            newidx = np.zeros_like(values, dtype=int)
+            for i, val in enumerate(values):
+                try:
+                    newidx[i] = self.namedict[val]
+                except KeyError:
+                    nextidx = len(self.namedict)
+                    self.namedict[val] = nextidx
+                    newnames.append(val)
+                    newidx[i] = nextidx
+
+        self.nmidx[rg.ix] = newidx  # newidx either single value or same size array
+        if newnames:
+            self.name_lookup = np.concatenate([self.name_lookup, newnames])
+        self.values = self.name_lookup[self.nmidx]
+
+
 # TODO: update docs to property doc
-class Resnames(ResidueAttr):
+class Resnames(_ResidueStringAttr):
     attrname = 'resnames'
     singular = 'resname'
     target_classes = [AtomGroup, ResidueGroup, SegmentGroup, Atom, Residue]
@@ -1903,18 +1954,14 @@ def _gen_initial_values(na, nr, ns):
         return np.arange(1, nr + 1)
 
 
-class ICodes(ResidueAttr):
+class ICodes(_ResidueStringAttr):
     """Insertion code for Atoms"""
     attrname = 'icodes'
     singular = 'icode'
     dtype = object
 
-    @staticmethod
-    def _gen_initial_values(na, nr, ns):
-        return np.array(['' for _ in range(nr)], dtype=object)
-
 
-class Moltypes(ResidueAttr):
+class Moltypes(_ResidueStringAttr):
     """Name of the molecule type
 
     Two molecules that share a molecule type share a common template topology.
@@ -1969,8 +2016,65 @@ def set_segments(self, sg, values):
         self.values[sg.ix] = values
 
 
+class _SegmentStringAttr(SegmentAttr):
+    """Segment-level string attribute stored as unique strings plus indices"""
+    def __init__(self, vals, guessed=False):
+        self._guessed = guessed
+        self.namedict = dict()  # maps str to nmidx
+        name_lookup = []  # maps idx to str
+        # eg namedict['O'] = 5 & name_lookup[5] = 'O'
+
+        self.nmidx = np.zeros_like(vals, dtype=int)  # lookup for each value
+        # eg vals[5] is 'C', so nmidx[5] = 7, where name_lookup[7] = 'C'
+
+        for i, val in enumerate(vals):
+            try:
+                self.nmidx[i] = self.namedict[val]
+            except KeyError:
+                nextidx = len(self.namedict)
+                self.namedict[val] = nextidx
+                name_lookup.append(val)
+                self.nmidx[i] = nextidx
+
+        self.name_lookup = np.array(name_lookup, dtype=object)
+        self.values = self.name_lookup[self.nmidx]
+
+    @staticmethod
+    def _gen_initial_values(na, nr, ns):
+        """Return an object array holding an empty string for each of ns"""
+        return np.array(['' for _ in range(ns)], dtype=object)
+
+    @_check_length
+    def set_segments(self, sg, values):
+        """Set values for sg from a single str or one str per member of sg"""
+        newnames = []
+        # two possibilities, either single value given, or one per Segment
+        if isinstance(values, str):
+            try:
+                newidx = self.namedict[values]
+            except KeyError:
+                newidx = len(self.namedict)
+                self.namedict[values] = newidx
+                newnames.append(values)
+        else:
+            newidx = np.zeros_like(values, dtype=int)
+            for i, val in enumerate(values):
+                try:
+                    newidx[i] = self.namedict[val]
+                except KeyError:
+                    nextidx = len(self.namedict)
+                    self.namedict[val] = nextidx
+                    newnames.append(val)
+                    newidx[i] = nextidx
+
+        self.nmidx[sg.ix] = newidx  # newidx either single value or same size array
+        if newnames:
+            self.name_lookup = np.concatenate([self.name_lookup, newnames])
+        self.values = self.name_lookup[self.nmidx]
+
+        
 # TODO: update docs to property doc
-class Segids(SegmentAttr):
+class Segids(_SegmentStringAttr):
     attrname = 'segids'
     singular = 'segid'
     target_classes = [AtomGroup, ResidueGroup, SegmentGroup,