From 8a29e83cf33f59f0d0507b26acd8f4d71dbb0dae Mon Sep 17 00:00:00 2001
From: Richard Gowers <richardjgowers@gmail.com>
Date: Tue, 25 Aug 2020 14:35:33 +0100
Subject: [PATCH] Faster name selections (#2755)

* fix #2751
* Python 2.7/3 backport of PR #2755 (use six.str_types, update CHANGELOG and versionchanged as 1.0.1)
* modified AtomNames topologyattr to include lookup table index
* rework atom name selection to use lookup tables
* fixed test supplying integer as atom name
* Update test_topologyattrs.py
* use dict-lookup string attrs EVERYWHERERE
* made protein selection faster, 48ms -> 0.5ms on GRO testfile
* improved nucleic/backbone selections
* Added explicit tests for Resnames topologyattr
  tests now provide str types for resnames/icodes
* use fnmatchcase to be case sensitive (this was a small unreported bug in 1.0.0: the matching
  was done case-insensitive)

Co-authored-by: Irfan Alibay <IAlibay@users.noreply.github.com>
Co-authored-by: Oliver Beckstein <orbeckst@gmail.com>

(cherry picked from commit 45e56e8314c278e3eb98ed7a6029b74e7435e8be)
---
 package/CHANGELOG                             |   2 +
 package/MDAnalysis/core/selection.py          | 179 +++++++++---
 package/MDAnalysis/core/topologyattrs.py      | 257 ++++++++++++++----
 .../core/test_atomselections.py               |   2 +-
 .../MDAnalysisTests/core/test_segmentgroup.py |  18 ++
 .../core/test_topologyattrs.py                |  36 ++-
 6 files changed, 396 insertions(+), 98 deletions(-)

diff --git a/package/CHANGELOG b/package/CHANGELOG
index 132c0738e6f..fe2dc564b0b 100644
--- a/package/CHANGELOG
+++ b/package/CHANGELOG
@@ -38,12 +38,14 @@ Fixes
     empty AtomGroup (Issue #2848)
   * Fixed reading in masses and charges from a hoomdxml file
     (Issue #2888, PR #2889)
+  * Fixed performance regression on select_atoms for string selections (#2751)
   * Fixed the DMSParser, allowing the creation of multiple segids sharing
     residues with identical resids (Issue #1387, PR #2872)
 
 Enhancements
   * Improved performances when parsing TPR files (PR #2804)
 
+
 Deprecations
   * waterdynamics.HydrogenBondLifetimes will be removed in 2.0.0 and
     replaced with hydrogenbonds.HydrogenBondAnalysis.lifetime() (#2547)
diff --git a/package/MDAnalysis/core/selection.py b/package/MDAnalysis/core/selection.py
index 9cda3023848..35dc3227e13 100644
--- a/package/MDAnalysis/core/selection.py
+++ b/package/MDAnalysis/core/selection.py
@@ -301,7 +301,7 @@ def __init__(self, parser, tokens):
         self.inRadius = float(tokens.popleft())
         self.exRadius = float(tokens.popleft())
         self.sel = parser.parse_expression(self.precedence)
-    
+
     @return_empty_on_apply
     def apply(self, group):
         indices = []
@@ -520,7 +520,7 @@ def apply(self, group):
         return group[mask]
 
 
-class StringSelection(Selection):
+class _ProtoStringSelection(Selection):
     """Selections based on text attributes
 
     .. versionchanged:: 1.0.0
@@ -535,11 +535,23 @@ def __init__(self, parser, tokens):
 
     @return_empty_on_apply
     def apply(self, group):
-        mask = np.zeros(len(group), dtype=np.bool)
-        for val in self.values:
-            values = getattr(group, self.field)
-            mask |= [fnmatch.fnmatch(x, val) for x in values]
-        return group[mask].unique
+        # rather than work on group.names, cheat and look at the lookup table
+        nmattr = getattr(group.universe._topology, self.field)
+
+        matches = []  # list of passing indices
+        # iterate through set of known atom names, check which pass
+        for nm, ix in nmattr.namedict.items():
+            if any(fnmatch.fnmatchcase(nm, val) for val in self.values):
+                matches.append(ix)
+
+        # atomname indices for members of this group
+        nmidx = nmattr.nmidx[getattr(group, self.level)]
+
+        return group[np.in1d(nmidx, matches)].unique
+
+
+class StringSelection(_ProtoStringSelection):
+    level = 'ix'  # operates on atom level attribute, i.e. '.ix'
 
 
 class AtomNameSelection(StringSelection):
@@ -566,22 +578,27 @@ class AtomICodeSelection(StringSelection):
     field = 'icodes'
 
 
-class ResidueNameSelection(StringSelection):
+class _ResidueStringSelection(_ProtoStringSelection):
+    level= 'resindices'
+
+
+class ResidueNameSelection(_ResidueStringSelection):
     """Select atoms based on 'resnames' attribute"""
     token = 'resname'
     field = 'resnames'
 
 
-class MoleculeTypeSelection(StringSelection):
+class MoleculeTypeSelection(_ResidueStringSelection):
     """Select atoms based on 'moltypes' attribute"""
     token = 'moltype'
     field = 'moltypes'
 
 
-class SegmentNameSelection(StringSelection):
+class SegmentNameSelection(_ProtoStringSelection):
     """Select atoms based on 'segids' attribute"""
     token = 'segid'
     field = 'segids'
+    level = 'segindices'
 
 
 class AltlocSelection(StringSelection):
@@ -791,10 +808,15 @@ class ProteinSelection(Selection):
     See Also
     --------
     :func:`MDAnalysis.lib.util.convert_aa_code`
+
+
+    .. versionchanged:: 1.0.1
+       prot_res changed to set (from numpy array)
+       performance improved by ~100x on larger systems
     """
     token = 'protein'
 
-    prot_res = np.array([
+    prot_res = {
         # CHARMM top_all27_prot_lipid.rtf
         'ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY', 'HSD',
         'HSE', 'HSP', 'ILE', 'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER', 'THR',
@@ -817,14 +839,20 @@ class ProteinSelection(Selection):
         'CLEU', 'CILE', 'CVAL', 'CASF', 'CASN', 'CGLN', 'CARG', 'CHID', 'CHIE',
         'CHIP', 'CTRP', 'CPHE', 'CTYR', 'CGLU', 'CASP', 'CLYS', 'CPRO', 'CCYS',
         'CCYX', 'CMET', 'CME', 'ASF',
-    ])
+    }
 
     def __init__(self, parser, tokens):
         pass
 
     def apply(self, group):
-        mask = np.in1d(group.resnames, self.prot_res)
-        return group[mask].unique
+        resname_attr = group.universe._topology.resnames
+        # which values in resname attr are in prot_res?
+        matches = [ix for (nm, ix) in resname_attr.namedict.items()
+                   if nm in self.prot_res]
+        # index of each atom's resname
+        nmidx = resname_attr.nmidx[group.resindices]
+        # intersect atom's resname index and matches to prot_res
+        return group[np.in1d(nmidx, matches)].unique
 
 
 class NucleicSelection(Selection):
@@ -839,23 +867,32 @@ class NucleicSelection(Selection):
 
     .. versionchanged:: 0.8
        additional Gromacs selections
+    .. versionchanged:: 1.0.1
+       nucl_res changed to set (from numpy array)
+       performance improved by ~100x on larger systems
     """
     token = 'nucleic'
 
-    nucl_res = np.array([
+    nucl_res = {
         'ADE', 'URA', 'CYT', 'GUA', 'THY', 'DA', 'DC', 'DG', 'DT', 'RA',
         'RU', 'RG', 'RC', 'A', 'T', 'U', 'C', 'G',
         'DA5', 'DC5', 'DG5', 'DT5',
         'DA3', 'DC3', 'DG3', 'DT3',
         'RA5', 'RU5', 'RG5', 'RC5',
         'RA3', 'RU3', 'RG3', 'RC3'
-    ])
+    }
 
     def __init__(self, parser, tokens):
         pass
 
     def apply(self, group):
-        mask = np.in1d(group.resnames, self.nucl_res)
+        resnames = group.universe._topology.resnames
+        nmidx = resnames.nmidx[group.resindices]
+
+        matches = [ix for (nm, ix) in resnames.namedict.items()
+                   if nm in self.nucl_res]
+        mask = np.in1d(nmidx, matches)
+
         return group[mask].unique
 
 
@@ -864,14 +901,32 @@ class BackboneSelection(ProteinSelection):
 
     This excludes OT* on C-termini
     (which are included by, eg VMD's backbone selection).
+
+
+    .. versionchanged:: 1.0.1
+       bb_atoms changed to set (from numpy array)
+       performance improved by ~100x on larger systems
     """
     token = 'backbone'
-    bb_atoms = np.array(['N', 'CA', 'C', 'O'])
+    bb_atoms = {'N', 'CA', 'C', 'O'}
 
     def apply(self, group):
-        mask = np.in1d(group.names, self.bb_atoms)
-        mask &= np.in1d(group.resnames, self.prot_res)
-        return group[mask].unique
+        atomnames = group.universe._topology.names
+        resnames = group.universe._topology.resnames
+
+        # filter by atom names
+        name_matches = [ix for (nm, ix) in atomnames.namedict.items()
+                        if nm in self.bb_atoms]
+        nmidx = atomnames.nmidx[group.ix]
+        group = group[np.in1d(nmidx, name_matches)]
+
+        # filter by resnames
+        resname_matches = [ix for (nm, ix) in resnames.namedict.items()
+                           if nm in self.prot_res]
+        nmidx = resnames.nmidx[group.resindices]
+        group = group[np.in1d(nmidx, resname_matches)]
+
+        return group.unique
 
 
 class NucleicBackboneSelection(NucleicSelection):
@@ -879,14 +934,32 @@ class NucleicBackboneSelection(NucleicSelection):
 
     These atoms are only recognized if they are in a residue matched
     by the :class:`NucleicSelection`.
+
+
+    .. versionchanged:: 1.0.1
+       bb_atoms changed to set (from numpy array)
+       performance improved by ~100x on larger systems
     """
     token = 'nucleicbackbone'
-    bb_atoms = np.array(["P", "C5'", "C3'", "O3'", "O5'"])
+    bb_atoms = {"P", "C5'", "C3'", "O3'", "O5'"}
 
     def apply(self, group):
-        mask = np.in1d(group.names, self.bb_atoms)
-        mask &= np.in1d(group.resnames, self.nucl_res)
-        return group[mask].unique
+        atomnames = group.universe._topology.names
+        resnames = group.universe._topology.resnames
+
+        # filter by atom names
+        name_matches = [ix for (nm, ix) in atomnames.namedict.items()
+                        if nm in self.bb_atoms]
+        nmidx = atomnames.nmidx[group.ix]
+        group = group[np.in1d(nmidx, name_matches)]
+
+        # filter by resnames
+        resname_matches = [ix for (nm, ix) in resnames.namedict.items()
+                           if nm in self.nucl_res]
+        nmidx = resnames.nmidx[group.resindices]
+        group = group[np.in1d(nmidx, resname_matches)]
+
+        return group.unique
 
 
 class BaseSelection(NucleicSelection):
@@ -896,29 +969,65 @@ class BaseSelection(NucleicSelection):
 
      'N9', 'N7', 'C8', 'C5', 'C4', 'N3', 'C2', 'N1', 'C6',
      'O6','N2','N6', 'O2','N4','O4','C5M'
+
+
+    .. versionchanged:: 1.0.1
+       base_atoms changed to set (from numpy array)
+       performance improved by ~100x on larger systems
     """
     token = 'nucleicbase'
-    base_atoms = np.array([
+    base_atoms = {
         'N9', 'N7', 'C8', 'C5', 'C4', 'N3', 'C2', 'N1', 'C6',
         'O6', 'N2', 'N6',
-        'O2', 'N4', 'O4', 'C5M'])
+        'O2', 'N4', 'O4', 'C5M'}
 
     def apply(self, group):
-        mask = np.in1d(group.names, self.base_atoms)
-        mask &= np.in1d(group.resnames, self.nucl_res)
-        return group[mask].unique
+        atomnames = group.universe._topology.names
+        resnames = group.universe._topology.resnames
+
+        # filter by atom names
+        name_matches = [ix for (nm, ix) in atomnames.namedict.items()
+                        if nm in self.base_atoms]
+        nmidx = atomnames.nmidx[group.ix]
+        group = group[np.in1d(nmidx, name_matches)]
+
+        # filter by resnames
+        resname_matches = [ix for (nm, ix) in resnames.namedict.items()
+                           if nm in self.nucl_res]
+        nmidx = resnames.nmidx[group.resindices]
+        group = group[np.in1d(nmidx, resname_matches)]
+
+        return group.unique
 
 
 class NucleicSugarSelection(NucleicSelection):
     """Contains all atoms with name C1', C2', C3', C4', O2', O4', O3'.
+
+
+    .. versionchanged:: 1.0.1
+       sug_atoms changed to set (from numpy array)
+       performance improved by ~100x on larger systems
     """
     token = 'nucleicsugar'
-    sug_atoms = np.array(["C1'", "C2'", "C3'", "C4'", "O4'"])
+    sug_atoms = {"C1'", "C2'", "C3'", "C4'", "O4'"}
 
     def apply(self, group):
-        mask = np.in1d(group.names, self.sug_atoms)
-        mask &= np.in1d(group.resnames, self.nucl_res)
-        return group[mask].unique
+        atomnames = group.universe._topology.names
+        resnames = group.universe._topology.resnames
+
+        # filter by atom names
+        name_matches = [ix for (nm, ix) in atomnames.namedict.items()
+                        if nm in self.sug_atoms]
+        nmidx = atomnames.nmidx[group.ix]
+        group = group[np.in1d(nmidx, name_matches)]
+
+        # filter by resnames
+        resname_matches = [ix for (nm, ix) in resnames.namedict.items()
+                           if nm in self.nucl_res]
+        nmidx = resnames.nmidx[group.resindices]
+        group = group[np.in1d(nmidx, resname_matches)]
+
+        return group.unique
 
 
 class PropertySelection(Selection):
@@ -1035,7 +1144,7 @@ class SameSelection(Selection):
     Selects all atoms that have the same subkeyword value as any atom in selection
 
     .. versionchanged:: 1.0.0
-       Map :code:`"residue"` to :code:`"resindices"` and :code:`"segment"` to 
+       Map :code:`"residue"` to :code:`"resindices"` and :code:`"segment"` to
        :code:`"segindices"` (see #2669 and #2672)
     """
 
diff --git a/package/MDAnalysis/core/topologyattrs.py b/package/MDAnalysis/core/topologyattrs.py
index 60ce5e9ad51..553e8d71704 100644
--- a/package/MDAnalysis/core/topologyattrs.py
+++ b/package/MDAnalysis/core/topologyattrs.py
@@ -477,8 +477,65 @@ def _gen_initial_values(na, nr, ns):
         return np.arange(1, na + 1)
 
 
+class _AtomStringAttr(AtomAttr):
+    def __init__(self, vals, guessed=False):
+        self._guessed = guessed
+
+        self.namedict = dict()  # maps str to nmidx
+        name_lookup = []  # maps idx to str
+        # eg namedict['O'] = 5 & name_lookup[5] = 'O'
+
+        self.nmidx = np.zeros_like(vals, dtype=int)  # the lookup for each atom
+        # eg Atom 5 is 'C', so nmidx[5] = 7, where name_lookup[7] = 'C'
+
+        for i, val in enumerate(vals):
+            try:
+                self.nmidx[i] = self.namedict[val]
+            except KeyError:
+                nextidx = len(self.namedict)
+                self.namedict[val] = nextidx
+                name_lookup.append(val)
+
+                self.nmidx[i] = nextidx
+
+        self.name_lookup = np.array(name_lookup, dtype=object)
+        self.values = self.name_lookup[self.nmidx]
+
+    @staticmethod
+    def _gen_initial_values(na, nr, ns):
+        return np.array(['' for _ in range(na)], dtype=object)
+
+    @_check_length
+    def set_atoms(self, ag, values):
+        newnames = []
+
+        # two possibilities, either single value given, or one per Atom
+        if isinstance(values, six.str_types):
+            try:
+                newidx = self.namedict[values]
+            except KeyError:
+                newidx = len(self.namedict)
+                self.namedict[values] = newidx
+                newnames.append(values)
+        else:
+            newidx = np.zeros_like(values, dtype=int)
+            for i, val in enumerate(values):
+                try:
+                    newidx[i] = self.namedict[val]
+                except KeyError:
+                    nextidx = len(self.namedict)
+                    self.namedict[val] = nextidx
+                    newnames.append(val)
+                    newidx[i] = nextidx
+
+        self.nmidx[ag.ix] = newidx  # newidx either single value or same size array
+        if newnames:
+            self.name_lookup = np.concatenate([self.name_lookup, newnames])
+        self.values = self.name_lookup[self.nmidx]
+
+
 # TODO: update docs to property doc
-class Atomnames(AtomAttr):
+class Atomnames(_AtomStringAttr):
     """Name for each atom.
     """
     attrname = 'names'
@@ -487,10 +544,6 @@ class Atomnames(AtomAttr):
     dtype = object
     transplants = defaultdict(list)
 
-    @staticmethod
-    def _gen_initial_values(na, nr, ns):
-        return np.array(['' for _ in range(na)], dtype=object)
-
     def phi_selection(residue, c_name='C', n_name='N', ca_name='CA'):
         """Select AtomGroup corresponding to the phi protein backbone dihedral
         C'-N-CA-C.
@@ -540,7 +593,7 @@ def phi_selection(residue, c_name='C', n_name='N', ca_name='CA'):
     transplants[Residue].append(('phi_selection', phi_selection))
 
     def phi_selections(residues, c_name='C', n_name='N', ca_name='CA'):
-        """Select list of AtomGroups corresponding to the phi protein 
+        """Select list of AtomGroups corresponding to the phi protein
         backbone dihedral C'-N-CA-C.
 
         Parameters
@@ -580,9 +633,9 @@ def phi_selections(residues, c_name='C', n_name='N', ca_name='CA'):
                 except IndexError:
                     invalid.append(i)
             prev = sum(prevls)
-        
+
         keep_prev = [sum(r.atoms.names == c_name) == 1 for r in prev]
-        keep_res = [all(sum(r.atoms.names == n) == 1 for n in ncac_names) 
+        keep_res = [all(sum(r.atoms.names == n) == 1 for n in ncac_names)
                     for r in residues]
         keep = np.array(keep_prev) & np.array(keep_res)
         keep[invalid] = False
@@ -591,7 +644,7 @@ def phi_selections(residues, c_name='C', n_name='N', ca_name='CA'):
         prev = prev[keep]
         residues = residues[keep]
         keepix = np.where(keep)[0]
-        
+
         c_ = prev.atoms[prev.atoms.names == c_name]
         n = residues.atoms[residues.atoms.names == n_name]
         ca = residues.atoms[residues.atoms.names == ca_name]
@@ -660,13 +713,13 @@ def psi_selection(residue, c_name='C', n_name='N', ca_name='CA'):
     transplants[Residue].append(('psi_selection', psi_selection))
 
     def _get_next_residues_by_resid(residues):
-        """Select list of Residues corresponding to the next resid for each 
+        """Select list of Residues corresponding to the next resid for each
         residue in `residues`.
 
         Returns
         -------
         List of Residues
-            List of the next residues in the Universe, by resid and segid. 
+            List of the next residues in the Universe, by resid and segid.
             If not found, the corresponding item in the list is ``None``.
 
         .. versionadded:: 1.0.0
@@ -700,13 +753,13 @@ def _get_next_residues_by_resid(residues):
                                       _get_next_residues_by_resid))
 
     def _get_prev_residues_by_resid(residues):
-        """Select list of Residues corresponding to the previous resid for each 
+        """Select list of Residues corresponding to the previous resid for each
         residue in `residues`.
 
         Returns
         -------
         List of Residues
-            List of the previous residues in the Universe, by resid and segid. 
+            List of the previous residues in the Universe, by resid and segid.
             If not found, the corresponding item in the list is ``None``.
 
         .. versionadded:: 1.0.0
@@ -728,12 +781,12 @@ def _get_prev_residues_by_resid(residues):
                     pvres[i] = None
         return pvres
 
-    
+
     transplants[ResidueGroup].append(('_get_prev_residues_by_resid',
                                       _get_prev_residues_by_resid))
 
     def psi_selections(residues, c_name='C', n_name='N', ca_name='CA'):
-        """Select list of AtomGroups corresponding to the psi protein 
+        """Select list of AtomGroups corresponding to the psi protein
         backbone dihedral N-CA-C-N'.
 
         Parameters
@@ -749,7 +802,7 @@ def psi_selections(residues, c_name='C', n_name='N', ca_name='CA'):
         -------
         List of AtomGroups
             4-atom selections in the correct order. If no N' found in the
-            following residue (by resid) then the corresponding item in the 
+            following residue (by resid) then the corresponding item in the
             list is ``None``.
 
         .. versionadded:: 1.0.0
@@ -762,13 +815,13 @@ def psi_selections(residues, c_name='C', n_name='N', ca_name='CA'):
         ncac_names = [n_name, ca_name, c_name]
 
         keep_nxt = [sum(r.atoms.names == n_name) == 1 for r in nxt]
-        keep_res = [all(sum(r.atoms.names == n) == 1 for n in ncac_names) 
+        keep_res = [all(sum(r.atoms.names == n) == 1 for n in ncac_names)
                     for r in residues]
         keep = np.array(keep_nxt) & np.array(keep_res)
         nxt = nxt[keep]
         residues = residues[keep]
         keepix = np.where(keep)[0]
-        
+
         n = residues.atoms[residues.atoms.names == n_name]
         ca = residues.atoms[residues.atoms.names == ca_name]
         c = residues.atoms[residues.atoms.names == c_name]
@@ -837,7 +890,7 @@ def omega_selection(residue, c_name='C', n_name='N', ca_name='CA'):
     transplants[Residue].append(('omega_selection', omega_selection))
 
     def omega_selections(residues, c_name='C', n_name='N', ca_name='CA'):
-        """Select list of AtomGroups corresponding to the omega protein 
+        """Select list of AtomGroups corresponding to the omega protein
         backbone dihedral CA-C-N'-CA'.
 
         omega describes the -C-N- peptide bond. Typically, it is trans (180
@@ -857,7 +910,7 @@ def omega_selections(residues, c_name='C', n_name='N', ca_name='CA'):
         -------
         List of AtomGroups
             4-atom selections in the correct order. If no C' found in the
-            previous residue (by resid) then the corresponding item in the 
+            previous residue (by resid) then the corresponding item in the
             list is ``None``.
 
         .. versionadded:: 1.0.0
@@ -867,18 +920,18 @@ def omega_selections(residues, c_name='C', n_name='N', ca_name='CA'):
         rix = np.where(nxtres)[0]
         nxt = sum(nxtres[rix])
         residues = residues[rix]
-        
+
         nxtatoms = [ca_name, n_name]
         resatoms = [ca_name, c_name]
-        keep_nxt = [all(sum(r.atoms.names == n) == 1 for n in nxtatoms) 
+        keep_nxt = [all(sum(r.atoms.names == n) == 1 for n in nxtatoms)
                     for r in nxt]
-        keep_res = [all(sum(r.atoms.names == n) == 1 for n in resatoms) 
+        keep_res = [all(sum(r.atoms.names == n) == 1 for n in resatoms)
                     for r in residues]
         keep = np.array(keep_nxt) & np.array(keep_res)
         nxt = nxt[keep]
         residues = residues[keep]
         keepix = np.where(keep)[0]
-        
+
         c = residues.atoms[residues.atoms.names == c_name]
         ca = residues.atoms[residues.atoms.names == ca_name]
         n_ = nxt.atoms[nxt.atoms.names == n_name]
@@ -889,7 +942,7 @@ def omega_selections(residues, c_name='C', n_name='N', ca_name='CA'):
 
     transplants[ResidueGroup].append(('omega_selections', omega_selections))
 
-    def chi1_selection(residue, n_name='N', ca_name='CA', cb_name='CB', 
+    def chi1_selection(residue, n_name='N', ca_name='CA', cb_name='CB',
                        cg_name='CG'):
         """Select AtomGroup corresponding to the chi1 sidechain dihedral N-CA-CB-CG.
 
@@ -924,9 +977,9 @@ def chi1_selection(residue, n_name='N', ca_name='CA', cb_name='CB',
 
     transplants[Residue].append(('chi1_selection', chi1_selection))
 
-    def chi1_selections(residues, n_name='N', ca_name='CA', cb_name='CB', 
+    def chi1_selections(residues, n_name='N', ca_name='CA', cb_name='CB',
                         cg_name='CG'):
-        """Select list of AtomGroups corresponding to the chi1 sidechain dihedral 
+        """Select list of AtomGroups corresponding to the chi1 sidechain dihedral
         N-CA-CB-CG.
 
         Parameters
@@ -950,7 +1003,7 @@ def chi1_selections(residues, n_name='N', ca_name='CA', cb_name='CB',
         """
         results = np.array([None]*len(residues))
         names = [n_name, ca_name, cb_name, cg_name]
-        keep = [all(sum(r.atoms.names == n) == 1 for n in names) 
+        keep = [all(sum(r.atoms.names == n) == 1 for n in names)
                 for r in residues]
         keepix = np.where(keep)[0]
         residues = residues[keep]
@@ -959,25 +1012,21 @@ def chi1_selections(residues, n_name='N', ca_name='CA', cb_name='CB',
         ags = [residues.atoms[atnames == n] for n in names]
         results[keepix] = [sum(atoms) for atoms in zip(*ags)]
         return list(results)
-    
+
     transplants[ResidueGroup].append(('chi1_selections', chi1_selections))
 
 
 # TODO: update docs to property doc
-class Atomtypes(AtomAttr):
+class Atomtypes(_AtomStringAttr):
     """Type for each atom"""
     attrname = 'types'
     singular = 'type'
     per_object = 'atom'
     dtype = object
 
-    @staticmethod
-    def _gen_initial_values(na, nr, ns):
-        return np.array(['' for _ in range(na)], dtype=object)
-
 
 # TODO: update docs to property doc
-class Elements(AtomAttr):
+class Elements(_AtomStringAttr):
     """Element for each atom"""
     attrname = 'elements'
     singular = 'element'
@@ -1001,7 +1050,7 @@ def _gen_initial_values(na, nr, ns):
         return np.zeros(na)
 
 
-class RecordTypes(AtomAttr):
+class RecordTypes(_AtomStringAttr):
     """For PDB-like formats, indicates if ATOM or HETATM
 
     Defaults to 'ATOM'
@@ -1019,7 +1068,7 @@ def _gen_initial_values(na, nr, ns):
         return np.array(['ATOM'] * na, dtype=object)
 
 
-class ChainIDs(AtomAttr):
+class ChainIDs(_AtomStringAttr):
     """ChainID per atom
 
     Note
@@ -1031,10 +1080,6 @@ class ChainIDs(AtomAttr):
     per_object = 'atom'
     dtype = object
 
-    @staticmethod
-    def _gen_initial_values(na, nr, ns):
-        return np.array(['' for _ in range(na)], dtype=object)
-
 
 class Tempfactors(AtomAttr):
     """Tempfactor for atoms"""
@@ -1428,7 +1473,7 @@ def principal_axes(group, pbc=False):
 
 
         .. versionchanged:: 0.8 Added *pbc* keyword
-        .. versionchanged:: 1.0.0 
+        .. versionchanged:: 1.0.0
             Always return principal axes in right-hand convention.
 
         """
@@ -1580,7 +1625,7 @@ def _gen_initial_values(na, nr, ns):
 
 
 # TODO: update docs to property doc
-class AltLocs(AtomAttr):
+class AltLocs(_AtomStringAttr):
     """AltLocs for each atom"""
     attrname = 'altLocs'
     singular = 'altLoc'
@@ -1722,8 +1767,65 @@ def _gen_initial_values(na, nr, ns):
         return np.arange(1, nr + 1)
 
 
+class _ResidueStringAttr(ResidueAttr):
+    def __init__(self, vals, guessed=False):
+        self._guessed = guessed
+
+        self.namedict = dict()  # maps str to nmidx
+        name_lookup = []  # maps idx to str
+        # eg namedict['O'] = 5 & name_lookup[5] = 'O'
+
+        self.nmidx = np.zeros_like(vals, dtype=int)  # the lookup for each atom
+        # eg Atom 5 is 'C', so nmidx[5] = 7, where name_lookup[7] = 'C'
+
+        for i, val in enumerate(vals):
+            try:
+                self.nmidx[i] = self.namedict[val]
+            except KeyError:
+                nextidx = len(self.namedict)
+                self.namedict[val] = nextidx
+                name_lookup.append(val)
+
+                self.nmidx[i] = nextidx
+
+        self.name_lookup = np.array(name_lookup, dtype=object)
+        self.values = self.name_lookup[self.nmidx]
+
+    @staticmethod
+    def _gen_initial_values(na, nr, ns):
+        return np.array(['' for _ in range(nr)], dtype=object)
+
+    @_check_length
+    def set_residues(self, rg, values):
+        newnames = []
+
+        # two possibilities, either single value given, or one per Atom
+        if isinstance(values, six.str_types):
+            try:
+                newidx = self.namedict[values]
+            except KeyError:
+                newidx = len(self.namedict)
+                self.namedict[values] = newidx
+                newnames.append(values)
+        else:
+            newidx = np.zeros_like(values, dtype=int)
+            for i, val in enumerate(values):
+                try:
+                    newidx[i] = self.namedict[val]
+                except KeyError:
+                    nextidx = len(self.namedict)
+                    self.namedict[val] = nextidx
+                    newnames.append(val)
+                    newidx[i] = nextidx
+
+        self.nmidx[rg.ix] = newidx  # newidx either single value or same size array
+        if newnames:
+            self.name_lookup = np.concatenate([self.name_lookup, newnames])
+        self.values = self.name_lookup[self.nmidx]
+
+
 # TODO: update docs to property doc
-class Resnames(ResidueAttr):
+class Resnames(_ResidueStringAttr):
     attrname = 'resnames'
     singular = 'resname'
     target_classes = [AtomGroup, ResidueGroup, SegmentGroup, Atom, Residue]
@@ -1844,18 +1946,14 @@ def _gen_initial_values(na, nr, ns):
         return np.arange(1, nr + 1)
 
 
-class ICodes(ResidueAttr):
+class ICodes(_ResidueStringAttr):
     """Insertion code for Atoms"""
     attrname = 'icodes'
     singular = 'icode'
     dtype = object
 
-    @staticmethod
-    def _gen_initial_values(na, nr, ns):
-        return np.array(['' for _ in range(nr)], dtype=object)
-
 
-class Moltypes(ResidueAttr):
+class Moltypes(_ResidueStringAttr):
     """Name of the molecule type
 
     Two molecules that share a molecule type share a common template topology.
@@ -1910,8 +2008,65 @@ def set_segments(self, sg, values):
         self.values[sg.ix] = values
 
 
+class _SegmentStringAttr(SegmentAttr):
+    def __init__(self, vals, guessed=False):
+        self._guessed = guessed
+
+        self.namedict = dict()  # maps str to nmidx
+        name_lookup = []  # maps idx to str
+        # eg namedict['O'] = 5 & name_lookup[5] = 'O'
+
+        self.nmidx = np.zeros_like(vals, dtype=int)  # the lookup for each atom
+        # eg Atom 5 is 'C', so nmidx[5] = 7, where name_lookup[7] = 'C'
+
+        for i, val in enumerate(vals):
+            try:
+                self.nmidx[i] = self.namedict[val]
+            except KeyError:
+                nextidx = len(self.namedict)
+                self.namedict[val] = nextidx
+                name_lookup.append(val)
+
+                self.nmidx[i] = nextidx
+
+        self.name_lookup = np.array(name_lookup, dtype=object)
+        self.values = self.name_lookup[self.nmidx]
+
+    @staticmethod
+    def _gen_initial_values(na, nr, ns):
+        return np.array(['' for _ in range(nr)], dtype=object)
+
+    @_check_length
+    def set_segments(self, sg, values):
+        newnames = []
+
+        # two possibilities, either single value given, or one per Atom
+        if isinstance(values, six.str_types):
+            try:
+                newidx = self.namedict[values]
+            except KeyError:
+                newidx = len(self.namedict)
+                self.namedict[values] = newidx
+                newnames.append(values)
+        else:
+            newidx = np.zeros_like(values, dtype=int)
+            for i, val in enumerate(values):
+                try:
+                    newidx[i] = self.namedict[val]
+                except KeyError:
+                    nextidx = len(self.namedict)
+                    self.namedict[val] = nextidx
+                    newnames.append(val)
+                    newidx[i] = nextidx
+
+        self.nmidx[sg.ix] = newidx  # newidx either single value or same size array
+        if newnames:
+            self.name_lookup = np.concatenate([self.name_lookup, newnames])
+        self.values = self.name_lookup[self.nmidx]
+
+
 # TODO: update docs to property doc
-class Segids(SegmentAttr):
+class Segids(_SegmentStringAttr):
     attrname = 'segids'
     singular = 'segid'
     target_classes = [AtomGroup, ResidueGroup, SegmentGroup,
diff --git a/testsuite/MDAnalysisTests/core/test_atomselections.py b/testsuite/MDAnalysisTests/core/test_atomselections.py
index c4d0ce16c0c..904002ed304 100644
--- a/testsuite/MDAnalysisTests/core/test_atomselections.py
+++ b/testsuite/MDAnalysisTests/core/test_atomselections.py
@@ -78,7 +78,7 @@ def test_protein(self, universe):
                      sorted(universe.select_atoms('segid 4AKE').indices),
                      "selected protein is not the same as auto-generated protein segment s4AKE")
 
-    @pytest.mark.parametrize('resname', MDAnalysis.core.selection.ProteinSelection.prot_res)
+    @pytest.mark.parametrize('resname', sorted(MDAnalysis.core.selection.ProteinSelection.prot_res))
     def test_protein_resnames(self, resname):
         u = make_Universe(('resnames',))
         # set half the residues' names to the resname we're testing
diff --git a/testsuite/MDAnalysisTests/core/test_segmentgroup.py b/testsuite/MDAnalysisTests/core/test_segmentgroup.py
index fb0e97748bf..b138594ee36 100644
--- a/testsuite/MDAnalysisTests/core/test_segmentgroup.py
+++ b/testsuite/MDAnalysisTests/core/test_segmentgroup.py
@@ -90,6 +90,24 @@ def test_set_segid_updates_(universe):
                  err_msg="old selection was not changed in place after set_segid")
 
 
+def test_set_segids_many():
+    u = mda.Universe.empty(n_atoms=6, n_residues=2, n_segments=2,
+                           atom_resindex=[0, 0, 0, 1, 1, 1], residue_segindex=[0,1])
+    u.add_TopologyAttr('segids', ['A', 'B'])
+
+    # universe with 2 segments, A and B
+
+    u.segments.segids = ['X', 'Y']
+
+    assert u.segments[0].segid == 'X'
+    assert u.segments[1].segid == 'Y'
+
+    assert len(u.select_atoms('segid A')) == 0
+    assert len(u.select_atoms('segid B')) == 0
+    assert len(u.select_atoms('segid X')) == 3
+    assert len(u.select_atoms('segid Y')) == 3
+
+
 def test_atom_order(universe):
     assert_equal(universe.segments.atoms.indices,
                  sorted(universe.segments.atoms.indices))
diff --git a/testsuite/MDAnalysisTests/core/test_topologyattrs.py b/testsuite/MDAnalysisTests/core/test_topologyattrs.py
index 4c865a6a794..87ea18f70de 100644
--- a/testsuite/MDAnalysisTests/core/test_topologyattrs.py
+++ b/testsuite/MDAnalysisTests/core/test_topologyattrs.py
@@ -91,6 +91,7 @@ class TestAtomAttr(TopologyAttrMixin):
 
     """
     values = np.array([7, 3, 69, 9993, 84, 194, 263, 501, 109, 5873])
+    single_value = 567
     attrclass = tpattrs.AtomAttr
 
     def test_set_atom_VE(self):
@@ -110,8 +111,9 @@ def test_get_atoms(self, attr):
     def test_set_atoms_singular(self, attr):
         # set len 2 Group to len 1 value
         dg = DummyGroup([3, 7])
-        attr.set_atoms(dg, 567)
-        assert_equal(attr.get_atoms(dg), np.array([567, 567]))
+        attr.set_atoms(dg, self.single_value)
+        assert_equal(attr.get_atoms(dg),
+                     np.array([self.single_value, self.single_value]))
 
     def test_set_atoms_plural(self, attr):
         # set len 2 Group to len 2 values
@@ -173,6 +175,7 @@ def test_cant_set_segment_indices(self, u):
 class TestAtomnames(TestAtomAttr):
     values = np.array(['O', 'C', 'CA', 'N', 'CB', 'CG', 'CD', 'NA', 'CL', 'OW'],
                       dtype=np.object)
+    single_value = 'Ca2'
     attrclass = tpattrs.Atomnames
 
 
@@ -204,29 +207,30 @@ class TestResidueAttr(TopologyAttrMixin):
     """Test residue-level TopologyAttrs.
 
     """
+    single_value = 2
     values = np.array([15.2, 395.6, 0.1, 9.8])
     attrclass = tpattrs.ResidueAttr
 
-    def test_set_residue_VE(self):
-        u = make_Universe(('resnames',))
-        res = u.residues[0]
+    def test_set_residue_VE(self, universe):
+        # setting e.g. resname to 2 values should fail with VE
+        res = universe.residues[0]
         with pytest.raises(ValueError):
-            setattr(res, 'resname', ['wrong', 'length'])
+            setattr(res, self.attrclass.singular, self.values[:2])
 
     def test_get_atoms(self, attr):
         assert_equal(attr.get_atoms(DummyGroup([7, 3, 9])),
-                           self.values[[3, 2, 2]])
+                     self.values[[3, 2, 2]])
 
     def test_get_residues(self, attr):
         assert_equal(attr.get_residues(DummyGroup([1, 2, 1, 3])),
-                           self.values[[1, 2, 1, 3]])
+                     self.values[[1, 2, 1, 3]])
 
     def test_set_residues_singular(self, attr):
         dg = DummyGroup([3, 0, 1])
-        attr.set_residues(dg, 2)
+        attr.set_residues(dg, self.single_value)
 
-        assert_almost_equal(attr.get_residues(dg),
-                                  np.array([2, 2, 2]))
+        assert_equal(attr.get_residues(dg),
+                     np.array([self.single_value]*3, dtype=self.values.dtype))
 
     def test_set_residues_plural(self, attr):
         attr.set_residues(DummyGroup([3, 0, 1]),
@@ -249,6 +253,16 @@ def test_get_segments(self, attr):
                            [self.values[[0, 3]], self.values[[1, 2]], self.values[[1, 2]]])
 
 
+class TestResnames(TestResidueAttr):
+    attrclass = tpattrs.Resnames
+    single_value = 'xyz'
+    values = np.array(['a', 'b', '', 'd'], dtype=object)
+
+
+class TestICodes(TestResnames):
+    attrclass = tpattrs.ICodes
+
+
 class TestResids(TestResidueAttr):
     values = np.array([10, 11, 18, 20])
     attrclass = tpattrs.Resids