From 8a29e83cf33f59f0d0507b26acd8f4d71dbb0dae Mon Sep 17 00:00:00 2001 From: Richard Gowers Date: Tue, 25 Aug 2020 14:35:33 +0100 Subject: [PATCH] Faster name selections (#2755) * fix #2751 * Python 2.7/3 backport of PR #2755 (use six.str_types, update CHANGELOG and versionchanged as 1.0.1) * modified AtomNames topologyattr to include lookup table index * rework atom name selection to use lookup tables * fixed test supplying integer as atom name * Update test_topologyattrs.py * use dict-lookup string attrs EVERYWHERERE * made protein selection faster, 48ms -> 0.5ms on GRO testfile * improved nucleic/backbone selections * Added explicit tests for Resnames topologyattr tests now provide str types for resnames/icodes * use fnmatchcase to be case sensitive (this was a small unreported bug in 1.0.0: the matching was done case-insensitive) Co-authored-by: Irfan Alibay Co-authored-by: Oliver Beckstein (cherry picked from commit 45e56e8314c278e3eb98ed7a6029b74e7435e8be) --- package/CHANGELOG | 2 + package/MDAnalysis/core/selection.py | 179 +++++++++--- package/MDAnalysis/core/topologyattrs.py | 257 ++++++++++++++---- .../core/test_atomselections.py | 2 +- .../MDAnalysisTests/core/test_segmentgroup.py | 18 ++ .../core/test_topologyattrs.py | 36 ++- 6 files changed, 396 insertions(+), 98 deletions(-) diff --git a/package/CHANGELOG b/package/CHANGELOG index 132c0738e6f..fe2dc564b0b 100644 --- a/package/CHANGELOG +++ b/package/CHANGELOG @@ -38,12 +38,14 @@ Fixes empty AtomGroup (Issue #2848) * Fixed reading in masses and charges from a hoomdxml file (Issue #2888, PR #2889) + * Fixed performance regression on select_atoms for string selections (#2751) * Fixed the DMSParser, allowing the creation of multiple segids sharing residues with identical resids (Issue #1387, PR #2872) Enhancements * Improved performances when parsing TPR files (PR #2804) + Deprecations * waterdynamics.HydrogenBondLifetimes will be removed in 2.0.0 and replaced with hydrogenbonds.HydrogenBondAnalysis.lifetime() (#2547) diff --git a/package/MDAnalysis/core/selection.py b/package/MDAnalysis/core/selection.py index 9cda3023848..35dc3227e13 100644 --- a/package/MDAnalysis/core/selection.py +++ b/package/MDAnalysis/core/selection.py @@ -301,7 +301,7 @@ def __init__(self, parser, tokens): self.inRadius = float(tokens.popleft()) self.exRadius = float(tokens.popleft()) self.sel = parser.parse_expression(self.precedence) - + @return_empty_on_apply def apply(self, group): indices = [] @@ -520,7 +520,7 @@ def apply(self, group): return group[mask] -class StringSelection(Selection): +class _ProtoStringSelection(Selection): """Selections based on text attributes .. versionchanged:: 1.0.0 @@ -535,11 +535,23 @@ def __init__(self, parser, tokens): @return_empty_on_apply def apply(self, group): - mask = np.zeros(len(group), dtype=np.bool) - for val in self.values: - values = getattr(group, self.field) - mask |= [fnmatch.fnmatch(x, val) for x in values] - return group[mask].unique + # rather than work on group.names, cheat and look at the lookup table + nmattr = getattr(group.universe._topology, self.field) + + matches = [] # list of passing indices + # iterate through set of known atom names, check which pass + for nm, ix in nmattr.namedict.items(): + if any(fnmatch.fnmatchcase(nm, val) for val in self.values): + matches.append(ix) + + # atomname indices for members of this group + nmidx = nmattr.nmidx[getattr(group, self.level)] + + return group[np.in1d(nmidx, matches)].unique + + +class StringSelection(_ProtoStringSelection): + level = 'ix' # operates on atom level attribute, i.e. '.ix' class AtomNameSelection(StringSelection): @@ -566,22 +578,27 @@ class AtomICodeSelection(StringSelection): field = 'icodes' -class ResidueNameSelection(StringSelection): +class _ResidueStringSelection(_ProtoStringSelection): + level= 'resindices' + + +class ResidueNameSelection(_ResidueStringSelection): """Select atoms based on 'resnames' attribute""" token = 'resname' field = 'resnames' -class MoleculeTypeSelection(StringSelection): +class MoleculeTypeSelection(_ResidueStringSelection): """Select atoms based on 'moltypes' attribute""" token = 'moltype' field = 'moltypes' -class SegmentNameSelection(StringSelection): +class SegmentNameSelection(_ProtoStringSelection): """Select atoms based on 'segids' attribute""" token = 'segid' field = 'segids' + level = 'segindices' class AltlocSelection(StringSelection): @@ -791,10 +808,15 @@ class ProteinSelection(Selection): See Also -------- :func:`MDAnalysis.lib.util.convert_aa_code` + + + .. versionchanged:: 1.0.1 + prot_res changed to set (from numpy array) + performance improved by ~100x on larger systems """ token = 'protein' - prot_res = np.array([ + prot_res = { # CHARMM top_all27_prot_lipid.rtf 'ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY', 'HSD', 'HSE', 'HSP', 'ILE', 'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER', 'THR', @@ -817,14 +839,20 @@ class ProteinSelection(Selection): 'CLEU', 'CILE', 'CVAL', 'CASF', 'CASN', 'CGLN', 'CARG', 'CHID', 'CHIE', 'CHIP', 'CTRP', 'CPHE', 'CTYR', 'CGLU', 'CASP', 'CLYS', 'CPRO', 'CCYS', 'CCYX', 'CMET', 'CME', 'ASF', - ]) + } def __init__(self, parser, tokens): pass def apply(self, group): - mask = np.in1d(group.resnames, self.prot_res) - return group[mask].unique + resname_attr = group.universe._topology.resnames + # which values in resname attr are in prot_res? + matches = [ix for (nm, ix) in resname_attr.namedict.items() + if nm in self.prot_res] + # index of each atom's resname + nmidx = resname_attr.nmidx[group.resindices] + # intersect atom's resname index and matches to prot_res + return group[np.in1d(nmidx, matches)].unique class NucleicSelection(Selection): @@ -839,23 +867,32 @@ class NucleicSelection(Selection): .. versionchanged:: 0.8 additional Gromacs selections + .. versionchanged:: 1.0.1 + nucl_res changed to set (from numpy array) + performance improved by ~100x on larger systems """ token = 'nucleic' - nucl_res = np.array([ + nucl_res = { 'ADE', 'URA', 'CYT', 'GUA', 'THY', 'DA', 'DC', 'DG', 'DT', 'RA', 'RU', 'RG', 'RC', 'A', 'T', 'U', 'C', 'G', 'DA5', 'DC5', 'DG5', 'DT5', 'DA3', 'DC3', 'DG3', 'DT3', 'RA5', 'RU5', 'RG5', 'RC5', 'RA3', 'RU3', 'RG3', 'RC3' - ]) + } def __init__(self, parser, tokens): pass def apply(self, group): - mask = np.in1d(group.resnames, self.nucl_res) + resnames = group.universe._topology.resnames + nmidx = resnames.nmidx[group.resindices] + + matches = [ix for (nm, ix) in resnames.namedict.items() + if nm in self.nucl_res] + mask = np.in1d(nmidx, matches) + return group[mask].unique @@ -864,14 +901,32 @@ class BackboneSelection(ProteinSelection): This excludes OT* on C-termini (which are included by, eg VMD's backbone selection). + + + .. versionchanged:: 1.0.1 + bb_atoms changed to set (from numpy array) + performance improved by ~100x on larger systems """ token = 'backbone' - bb_atoms = np.array(['N', 'CA', 'C', 'O']) + bb_atoms = {'N', 'CA', 'C', 'O'} def apply(self, group): - mask = np.in1d(group.names, self.bb_atoms) - mask &= np.in1d(group.resnames, self.prot_res) - return group[mask].unique + atomnames = group.universe._topology.names + resnames = group.universe._topology.resnames + + # filter by atom names + name_matches = [ix for (nm, ix) in atomnames.namedict.items() + if nm in self.bb_atoms] + nmidx = atomnames.nmidx[group.ix] + group = group[np.in1d(nmidx, name_matches)] + + # filter by resnames + resname_matches = [ix for (nm, ix) in resnames.namedict.items() + if nm in self.prot_res] + nmidx = resnames.nmidx[group.resindices] + group = group[np.in1d(nmidx, resname_matches)] + + return group.unique class NucleicBackboneSelection(NucleicSelection): @@ -879,14 +934,32 @@ class NucleicBackboneSelection(NucleicSelection): These atoms are only recognized if they are in a residue matched by the :class:`NucleicSelection`. + + + .. versionchanged:: 1.0.1 + bb_atoms changed to set (from numpy array) + performance improved by ~100x on larger systems """ token = 'nucleicbackbone' - bb_atoms = np.array(["P", "C5'", "C3'", "O3'", "O5'"]) + bb_atoms = {"P", "C5'", "C3'", "O3'", "O5'"} def apply(self, group): - mask = np.in1d(group.names, self.bb_atoms) - mask &= np.in1d(group.resnames, self.nucl_res) - return group[mask].unique + atomnames = group.universe._topology.names + resnames = group.universe._topology.resnames + + # filter by atom names + name_matches = [ix for (nm, ix) in atomnames.namedict.items() + if nm in self.bb_atoms] + nmidx = atomnames.nmidx[group.ix] + group = group[np.in1d(nmidx, name_matches)] + + # filter by resnames + resname_matches = [ix for (nm, ix) in resnames.namedict.items() + if nm in self.nucl_res] + nmidx = resnames.nmidx[group.resindices] + group = group[np.in1d(nmidx, resname_matches)] + + return group.unique class BaseSelection(NucleicSelection): @@ -896,29 +969,65 @@ class BaseSelection(NucleicSelection): 'N9', 'N7', 'C8', 'C5', 'C4', 'N3', 'C2', 'N1', 'C6', 'O6','N2','N6', 'O2','N4','O4','C5M' + + + .. versionchanged:: 1.0.1 + base_atoms changed to set (from numpy array) + performance improved by ~100x on larger systems """ token = 'nucleicbase' - base_atoms = np.array([ + base_atoms = { 'N9', 'N7', 'C8', 'C5', 'C4', 'N3', 'C2', 'N1', 'C6', 'O6', 'N2', 'N6', - 'O2', 'N4', 'O4', 'C5M']) + 'O2', 'N4', 'O4', 'C5M'} def apply(self, group): - mask = np.in1d(group.names, self.base_atoms) - mask &= np.in1d(group.resnames, self.nucl_res) - return group[mask].unique + atomnames = group.universe._topology.names + resnames = group.universe._topology.resnames + + # filter by atom names + name_matches = [ix for (nm, ix) in atomnames.namedict.items() + if nm in self.base_atoms] + nmidx = atomnames.nmidx[group.ix] + group = group[np.in1d(nmidx, name_matches)] + + # filter by resnames + resname_matches = [ix for (nm, ix) in resnames.namedict.items() + if nm in self.nucl_res] + nmidx = resnames.nmidx[group.resindices] + group = group[np.in1d(nmidx, resname_matches)] + + return group.unique class NucleicSugarSelection(NucleicSelection): """Contains all atoms with name C1', C2', C3', C4', O2', O4', O3'. + + + .. versionchanged:: 1.0.1 + sug_atoms changed to set (from numpy array) + performance improved by ~100x on larger systems """ token = 'nucleicsugar' - sug_atoms = np.array(["C1'", "C2'", "C3'", "C4'", "O4'"]) + sug_atoms = {"C1'", "C2'", "C3'", "C4'", "O4'"} def apply(self, group): - mask = np.in1d(group.names, self.sug_atoms) - mask &= np.in1d(group.resnames, self.nucl_res) - return group[mask].unique + atomnames = group.universe._topology.names + resnames = group.universe._topology.resnames + + # filter by atom names + name_matches = [ix for (nm, ix) in atomnames.namedict.items() + if nm in self.sug_atoms] + nmidx = atomnames.nmidx[group.ix] + group = group[np.in1d(nmidx, name_matches)] + + # filter by resnames + resname_matches = [ix for (nm, ix) in resnames.namedict.items() + if nm in self.nucl_res] + nmidx = resnames.nmidx[group.resindices] + group = group[np.in1d(nmidx, resname_matches)] + + return group.unique class PropertySelection(Selection): @@ -1035,7 +1144,7 @@ class SameSelection(Selection): Selects all atoms that have the same subkeyword value as any atom in selection .. versionchanged:: 1.0.0 - Map :code:`"residue"` to :code:`"resindices"` and :code:`"segment"` to + Map :code:`"residue"` to :code:`"resindices"` and :code:`"segment"` to :code:`"segindices"` (see #2669 and #2672) """ diff --git a/package/MDAnalysis/core/topologyattrs.py b/package/MDAnalysis/core/topologyattrs.py index 60ce5e9ad51..553e8d71704 100644 --- a/package/MDAnalysis/core/topologyattrs.py +++ b/package/MDAnalysis/core/topologyattrs.py @@ -477,8 +477,65 @@ def _gen_initial_values(na, nr, ns): return np.arange(1, na + 1) +class _AtomStringAttr(AtomAttr): + def __init__(self, vals, guessed=False): + self._guessed = guessed + + self.namedict = dict() # maps str to nmidx + name_lookup = [] # maps idx to str + # eg namedict['O'] = 5 & name_lookup[5] = 'O' + + self.nmidx = np.zeros_like(vals, dtype=int) # the lookup for each atom + # eg Atom 5 is 'C', so nmidx[5] = 7, where name_lookup[7] = 'C' + + for i, val in enumerate(vals): + try: + self.nmidx[i] = self.namedict[val] + except KeyError: + nextidx = len(self.namedict) + self.namedict[val] = nextidx + name_lookup.append(val) + + self.nmidx[i] = nextidx + + self.name_lookup = np.array(name_lookup, dtype=object) + self.values = self.name_lookup[self.nmidx] + + @staticmethod + def _gen_initial_values(na, nr, ns): + return np.array(['' for _ in range(na)], dtype=object) + + @_check_length + def set_atoms(self, ag, values): + newnames = [] + + # two possibilities, either single value given, or one per Atom + if isinstance(values, six.str_types): + try: + newidx = self.namedict[values] + except KeyError: + newidx = len(self.namedict) + self.namedict[values] = newidx + newnames.append(values) + else: + newidx = np.zeros_like(values, dtype=int) + for i, val in enumerate(values): + try: + newidx[i] = self.namedict[val] + except KeyError: + nextidx = len(self.namedict) + self.namedict[val] = nextidx + newnames.append(val) + newidx[i] = nextidx + + self.nmidx[ag.ix] = newidx # newidx either single value or same size array + if newnames: + self.name_lookup = np.concatenate([self.name_lookup, newnames]) + self.values = self.name_lookup[self.nmidx] + + # TODO: update docs to property doc -class Atomnames(AtomAttr): +class Atomnames(_AtomStringAttr): """Name for each atom. """ attrname = 'names' @@ -487,10 +544,6 @@ class Atomnames(AtomAttr): dtype = object transplants = defaultdict(list) - @staticmethod - def _gen_initial_values(na, nr, ns): - return np.array(['' for _ in range(na)], dtype=object) - def phi_selection(residue, c_name='C', n_name='N', ca_name='CA'): """Select AtomGroup corresponding to the phi protein backbone dihedral C'-N-CA-C. @@ -540,7 +593,7 @@ def phi_selection(residue, c_name='C', n_name='N', ca_name='CA'): transplants[Residue].append(('phi_selection', phi_selection)) def phi_selections(residues, c_name='C', n_name='N', ca_name='CA'): - """Select list of AtomGroups corresponding to the phi protein + """Select list of AtomGroups corresponding to the phi protein backbone dihedral C'-N-CA-C. Parameters @@ -580,9 +633,9 @@ def phi_selections(residues, c_name='C', n_name='N', ca_name='CA'): except IndexError: invalid.append(i) prev = sum(prevls) - + keep_prev = [sum(r.atoms.names == c_name) == 1 for r in prev] - keep_res = [all(sum(r.atoms.names == n) == 1 for n in ncac_names) + keep_res = [all(sum(r.atoms.names == n) == 1 for n in ncac_names) for r in residues] keep = np.array(keep_prev) & np.array(keep_res) keep[invalid] = False @@ -591,7 +644,7 @@ def phi_selections(residues, c_name='C', n_name='N', ca_name='CA'): prev = prev[keep] residues = residues[keep] keepix = np.where(keep)[0] - + c_ = prev.atoms[prev.atoms.names == c_name] n = residues.atoms[residues.atoms.names == n_name] ca = residues.atoms[residues.atoms.names == ca_name] @@ -660,13 +713,13 @@ def psi_selection(residue, c_name='C', n_name='N', ca_name='CA'): transplants[Residue].append(('psi_selection', psi_selection)) def _get_next_residues_by_resid(residues): - """Select list of Residues corresponding to the next resid for each + """Select list of Residues corresponding to the next resid for each residue in `residues`. Returns ------- List of Residues - List of the next residues in the Universe, by resid and segid. + List of the next residues in the Universe, by resid and segid. If not found, the corresponding item in the list is ``None``. .. versionadded:: 1.0.0 @@ -700,13 +753,13 @@ def _get_next_residues_by_resid(residues): _get_next_residues_by_resid)) def _get_prev_residues_by_resid(residues): - """Select list of Residues corresponding to the previous resid for each + """Select list of Residues corresponding to the previous resid for each residue in `residues`. Returns ------- List of Residues - List of the previous residues in the Universe, by resid and segid. + List of the previous residues in the Universe, by resid and segid. If not found, the corresponding item in the list is ``None``. .. versionadded:: 1.0.0 @@ -728,12 +781,12 @@ def _get_prev_residues_by_resid(residues): pvres[i] = None return pvres - + transplants[ResidueGroup].append(('_get_prev_residues_by_resid', _get_prev_residues_by_resid)) def psi_selections(residues, c_name='C', n_name='N', ca_name='CA'): - """Select list of AtomGroups corresponding to the psi protein + """Select list of AtomGroups corresponding to the psi protein backbone dihedral N-CA-C-N'. Parameters @@ -749,7 +802,7 @@ def psi_selections(residues, c_name='C', n_name='N', ca_name='CA'): ------- List of AtomGroups 4-atom selections in the correct order. If no N' found in the - following residue (by resid) then the corresponding item in the + following residue (by resid) then the corresponding item in the list is ``None``. .. versionadded:: 1.0.0 @@ -762,13 +815,13 @@ def psi_selections(residues, c_name='C', n_name='N', ca_name='CA'): ncac_names = [n_name, ca_name, c_name] keep_nxt = [sum(r.atoms.names == n_name) == 1 for r in nxt] - keep_res = [all(sum(r.atoms.names == n) == 1 for n in ncac_names) + keep_res = [all(sum(r.atoms.names == n) == 1 for n in ncac_names) for r in residues] keep = np.array(keep_nxt) & np.array(keep_res) nxt = nxt[keep] residues = residues[keep] keepix = np.where(keep)[0] - + n = residues.atoms[residues.atoms.names == n_name] ca = residues.atoms[residues.atoms.names == ca_name] c = residues.atoms[residues.atoms.names == c_name] @@ -837,7 +890,7 @@ def omega_selection(residue, c_name='C', n_name='N', ca_name='CA'): transplants[Residue].append(('omega_selection', omega_selection)) def omega_selections(residues, c_name='C', n_name='N', ca_name='CA'): - """Select list of AtomGroups corresponding to the omega protein + """Select list of AtomGroups corresponding to the omega protein backbone dihedral CA-C-N'-CA'. omega describes the -C-N- peptide bond. Typically, it is trans (180 @@ -857,7 +910,7 @@ def omega_selections(residues, c_name='C', n_name='N', ca_name='CA'): ------- List of AtomGroups 4-atom selections in the correct order. If no C' found in the - previous residue (by resid) then the corresponding item in the + previous residue (by resid) then the corresponding item in the list is ``None``. .. versionadded:: 1.0.0 @@ -867,18 +920,18 @@ def omega_selections(residues, c_name='C', n_name='N', ca_name='CA'): rix = np.where(nxtres)[0] nxt = sum(nxtres[rix]) residues = residues[rix] - + nxtatoms = [ca_name, n_name] resatoms = [ca_name, c_name] - keep_nxt = [all(sum(r.atoms.names == n) == 1 for n in nxtatoms) + keep_nxt = [all(sum(r.atoms.names == n) == 1 for n in nxtatoms) for r in nxt] - keep_res = [all(sum(r.atoms.names == n) == 1 for n in resatoms) + keep_res = [all(sum(r.atoms.names == n) == 1 for n in resatoms) for r in residues] keep = np.array(keep_nxt) & np.array(keep_res) nxt = nxt[keep] residues = residues[keep] keepix = np.where(keep)[0] - + c = residues.atoms[residues.atoms.names == c_name] ca = residues.atoms[residues.atoms.names == ca_name] n_ = nxt.atoms[nxt.atoms.names == n_name] @@ -889,7 +942,7 @@ def omega_selections(residues, c_name='C', n_name='N', ca_name='CA'): transplants[ResidueGroup].append(('omega_selections', omega_selections)) - def chi1_selection(residue, n_name='N', ca_name='CA', cb_name='CB', + def chi1_selection(residue, n_name='N', ca_name='CA', cb_name='CB', cg_name='CG'): """Select AtomGroup corresponding to the chi1 sidechain dihedral N-CA-CB-CG. @@ -924,9 +977,9 @@ def chi1_selection(residue, n_name='N', ca_name='CA', cb_name='CB', transplants[Residue].append(('chi1_selection', chi1_selection)) - def chi1_selections(residues, n_name='N', ca_name='CA', cb_name='CB', + def chi1_selections(residues, n_name='N', ca_name='CA', cb_name='CB', cg_name='CG'): - """Select list of AtomGroups corresponding to the chi1 sidechain dihedral + """Select list of AtomGroups corresponding to the chi1 sidechain dihedral N-CA-CB-CG. Parameters @@ -950,7 +1003,7 @@ def chi1_selections(residues, n_name='N', ca_name='CA', cb_name='CB', """ results = np.array([None]*len(residues)) names = [n_name, ca_name, cb_name, cg_name] - keep = [all(sum(r.atoms.names == n) == 1 for n in names) + keep = [all(sum(r.atoms.names == n) == 1 for n in names) for r in residues] keepix = np.where(keep)[0] residues = residues[keep] @@ -959,25 +1012,21 @@ def chi1_selections(residues, n_name='N', ca_name='CA', cb_name='CB', ags = [residues.atoms[atnames == n] for n in names] results[keepix] = [sum(atoms) for atoms in zip(*ags)] return list(results) - + transplants[ResidueGroup].append(('chi1_selections', chi1_selections)) # TODO: update docs to property doc -class Atomtypes(AtomAttr): +class Atomtypes(_AtomStringAttr): """Type for each atom""" attrname = 'types' singular = 'type' per_object = 'atom' dtype = object - @staticmethod - def _gen_initial_values(na, nr, ns): - return np.array(['' for _ in range(na)], dtype=object) - # TODO: update docs to property doc -class Elements(AtomAttr): +class Elements(_AtomStringAttr): """Element for each atom""" attrname = 'elements' singular = 'element' @@ -1001,7 +1050,7 @@ def _gen_initial_values(na, nr, ns): return np.zeros(na) -class RecordTypes(AtomAttr): +class RecordTypes(_AtomStringAttr): """For PDB-like formats, indicates if ATOM or HETATM Defaults to 'ATOM' @@ -1019,7 +1068,7 @@ def _gen_initial_values(na, nr, ns): return np.array(['ATOM'] * na, dtype=object) -class ChainIDs(AtomAttr): +class ChainIDs(_AtomStringAttr): """ChainID per atom Note @@ -1031,10 +1080,6 @@ class ChainIDs(AtomAttr): per_object = 'atom' dtype = object - @staticmethod - def _gen_initial_values(na, nr, ns): - return np.array(['' for _ in range(na)], dtype=object) - class Tempfactors(AtomAttr): """Tempfactor for atoms""" @@ -1428,7 +1473,7 @@ def principal_axes(group, pbc=False): .. versionchanged:: 0.8 Added *pbc* keyword - .. versionchanged:: 1.0.0 + .. versionchanged:: 1.0.0 Always return principal axes in right-hand convention. """ @@ -1580,7 +1625,7 @@ def _gen_initial_values(na, nr, ns): # TODO: update docs to property doc -class AltLocs(AtomAttr): +class AltLocs(_AtomStringAttr): """AltLocs for each atom""" attrname = 'altLocs' singular = 'altLoc' @@ -1722,8 +1767,65 @@ def _gen_initial_values(na, nr, ns): return np.arange(1, nr + 1) +class _ResidueStringAttr(ResidueAttr): + def __init__(self, vals, guessed=False): + self._guessed = guessed + + self.namedict = dict() # maps str to nmidx + name_lookup = [] # maps idx to str + # eg namedict['O'] = 5 & name_lookup[5] = 'O' + + self.nmidx = np.zeros_like(vals, dtype=int) # the lookup for each atom + # eg Atom 5 is 'C', so nmidx[5] = 7, where name_lookup[7] = 'C' + + for i, val in enumerate(vals): + try: + self.nmidx[i] = self.namedict[val] + except KeyError: + nextidx = len(self.namedict) + self.namedict[val] = nextidx + name_lookup.append(val) + + self.nmidx[i] = nextidx + + self.name_lookup = np.array(name_lookup, dtype=object) + self.values = self.name_lookup[self.nmidx] + + @staticmethod + def _gen_initial_values(na, nr, ns): + return np.array(['' for _ in range(nr)], dtype=object) + + @_check_length + def set_residues(self, rg, values): + newnames = [] + + # two possibilities, either single value given, or one per Atom + if isinstance(values, six.str_types): + try: + newidx = self.namedict[values] + except KeyError: + newidx = len(self.namedict) + self.namedict[values] = newidx + newnames.append(values) + else: + newidx = np.zeros_like(values, dtype=int) + for i, val in enumerate(values): + try: + newidx[i] = self.namedict[val] + except KeyError: + nextidx = len(self.namedict) + self.namedict[val] = nextidx + newnames.append(val) + newidx[i] = nextidx + + self.nmidx[rg.ix] = newidx # newidx either single value or same size array + if newnames: + self.name_lookup = np.concatenate([self.name_lookup, newnames]) + self.values = self.name_lookup[self.nmidx] + + # TODO: update docs to property doc -class Resnames(ResidueAttr): +class Resnames(_ResidueStringAttr): attrname = 'resnames' singular = 'resname' target_classes = [AtomGroup, ResidueGroup, SegmentGroup, Atom, Residue] @@ -1844,18 +1946,14 @@ def _gen_initial_values(na, nr, ns): return np.arange(1, nr + 1) -class ICodes(ResidueAttr): +class ICodes(_ResidueStringAttr): """Insertion code for Atoms""" attrname = 'icodes' singular = 'icode' dtype = object - @staticmethod - def _gen_initial_values(na, nr, ns): - return np.array(['' for _ in range(nr)], dtype=object) - -class Moltypes(ResidueAttr): +class Moltypes(_ResidueStringAttr): """Name of the molecule type Two molecules that share a molecule type share a common template topology. @@ -1910,8 +2008,65 @@ def set_segments(self, sg, values): self.values[sg.ix] = values +class _SegmentStringAttr(SegmentAttr): + def __init__(self, vals, guessed=False): + self._guessed = guessed + + self.namedict = dict() # maps str to nmidx + name_lookup = [] # maps idx to str + # eg namedict['O'] = 5 & name_lookup[5] = 'O' + + self.nmidx = np.zeros_like(vals, dtype=int) # the lookup for each atom + # eg Atom 5 is 'C', so nmidx[5] = 7, where name_lookup[7] = 'C' + + for i, val in enumerate(vals): + try: + self.nmidx[i] = self.namedict[val] + except KeyError: + nextidx = len(self.namedict) + self.namedict[val] = nextidx + name_lookup.append(val) + + self.nmidx[i] = nextidx + + self.name_lookup = np.array(name_lookup, dtype=object) + self.values = self.name_lookup[self.nmidx] + + @staticmethod + def _gen_initial_values(na, nr, ns): + return np.array(['' for _ in range(nr)], dtype=object) + + @_check_length + def set_segments(self, sg, values): + newnames = [] + + # two possibilities, either single value given, or one per Atom + if isinstance(values, six.str_types): + try: + newidx = self.namedict[values] + except KeyError: + newidx = len(self.namedict) + self.namedict[values] = newidx + newnames.append(values) + else: + newidx = np.zeros_like(values, dtype=int) + for i, val in enumerate(values): + try: + newidx[i] = self.namedict[val] + except KeyError: + nextidx = len(self.namedict) + self.namedict[val] = nextidx + newnames.append(val) + newidx[i] = nextidx + + self.nmidx[sg.ix] = newidx # newidx either single value or same size array + if newnames: + self.name_lookup = np.concatenate([self.name_lookup, newnames]) + self.values = self.name_lookup[self.nmidx] + + # TODO: update docs to property doc -class Segids(SegmentAttr): +class Segids(_SegmentStringAttr): attrname = 'segids' singular = 'segid' target_classes = [AtomGroup, ResidueGroup, SegmentGroup, diff --git a/testsuite/MDAnalysisTests/core/test_atomselections.py b/testsuite/MDAnalysisTests/core/test_atomselections.py index c4d0ce16c0c..904002ed304 100644 --- a/testsuite/MDAnalysisTests/core/test_atomselections.py +++ b/testsuite/MDAnalysisTests/core/test_atomselections.py @@ -78,7 +78,7 @@ def test_protein(self, universe): sorted(universe.select_atoms('segid 4AKE').indices), "selected protein is not the same as auto-generated protein segment s4AKE") - @pytest.mark.parametrize('resname', MDAnalysis.core.selection.ProteinSelection.prot_res) + @pytest.mark.parametrize('resname', sorted(MDAnalysis.core.selection.ProteinSelection.prot_res)) def test_protein_resnames(self, resname): u = make_Universe(('resnames',)) # set half the residues' names to the resname we're testing diff --git a/testsuite/MDAnalysisTests/core/test_segmentgroup.py b/testsuite/MDAnalysisTests/core/test_segmentgroup.py index fb0e97748bf..b138594ee36 100644 --- a/testsuite/MDAnalysisTests/core/test_segmentgroup.py +++ b/testsuite/MDAnalysisTests/core/test_segmentgroup.py @@ -90,6 +90,24 @@ def test_set_segid_updates_(universe): err_msg="old selection was not changed in place after set_segid") +def test_set_segids_many(): + u = mda.Universe.empty(n_atoms=6, n_residues=2, n_segments=2, + atom_resindex=[0, 0, 0, 1, 1, 1], residue_segindex=[0,1]) + u.add_TopologyAttr('segids', ['A', 'B']) + + # universe with 2 segments, A and B + + u.segments.segids = ['X', 'Y'] + + assert u.segments[0].segid == 'X' + assert u.segments[1].segid == 'Y' + + assert len(u.select_atoms('segid A')) == 0 + assert len(u.select_atoms('segid B')) == 0 + assert len(u.select_atoms('segid X')) == 3 + assert len(u.select_atoms('segid Y')) == 3 + + def test_atom_order(universe): assert_equal(universe.segments.atoms.indices, sorted(universe.segments.atoms.indices)) diff --git a/testsuite/MDAnalysisTests/core/test_topologyattrs.py b/testsuite/MDAnalysisTests/core/test_topologyattrs.py index 4c865a6a794..87ea18f70de 100644 --- a/testsuite/MDAnalysisTests/core/test_topologyattrs.py +++ b/testsuite/MDAnalysisTests/core/test_topologyattrs.py @@ -91,6 +91,7 @@ class TestAtomAttr(TopologyAttrMixin): """ values = np.array([7, 3, 69, 9993, 84, 194, 263, 501, 109, 5873]) + single_value = 567 attrclass = tpattrs.AtomAttr def test_set_atom_VE(self): @@ -110,8 +111,9 @@ def test_get_atoms(self, attr): def test_set_atoms_singular(self, attr): # set len 2 Group to len 1 value dg = DummyGroup([3, 7]) - attr.set_atoms(dg, 567) - assert_equal(attr.get_atoms(dg), np.array([567, 567])) + attr.set_atoms(dg, self.single_value) + assert_equal(attr.get_atoms(dg), + np.array([self.single_value, self.single_value])) def test_set_atoms_plural(self, attr): # set len 2 Group to len 2 values @@ -173,6 +175,7 @@ def test_cant_set_segment_indices(self, u): class TestAtomnames(TestAtomAttr): values = np.array(['O', 'C', 'CA', 'N', 'CB', 'CG', 'CD', 'NA', 'CL', 'OW'], dtype=np.object) + single_value = 'Ca2' attrclass = tpattrs.Atomnames @@ -204,29 +207,30 @@ class TestResidueAttr(TopologyAttrMixin): """Test residue-level TopologyAttrs. """ + single_value = 2 values = np.array([15.2, 395.6, 0.1, 9.8]) attrclass = tpattrs.ResidueAttr - def test_set_residue_VE(self): - u = make_Universe(('resnames',)) - res = u.residues[0] + def test_set_residue_VE(self, universe): + # setting e.g. resname to 2 values should fail with VE + res = universe.residues[0] with pytest.raises(ValueError): - setattr(res, 'resname', ['wrong', 'length']) + setattr(res, self.attrclass.singular, self.values[:2]) def test_get_atoms(self, attr): assert_equal(attr.get_atoms(DummyGroup([7, 3, 9])), - self.values[[3, 2, 2]]) + self.values[[3, 2, 2]]) def test_get_residues(self, attr): assert_equal(attr.get_residues(DummyGroup([1, 2, 1, 3])), - self.values[[1, 2, 1, 3]]) + self.values[[1, 2, 1, 3]]) def test_set_residues_singular(self, attr): dg = DummyGroup([3, 0, 1]) - attr.set_residues(dg, 2) + attr.set_residues(dg, self.single_value) - assert_almost_equal(attr.get_residues(dg), - np.array([2, 2, 2])) + assert_equal(attr.get_residues(dg), + np.array([self.single_value]*3, dtype=self.values.dtype)) def test_set_residues_plural(self, attr): attr.set_residues(DummyGroup([3, 0, 1]), @@ -249,6 +253,16 @@ def test_get_segments(self, attr): [self.values[[0, 3]], self.values[[1, 2]], self.values[[1, 2]]]) +class TestResnames(TestResidueAttr): + attrclass = tpattrs.Resnames + single_value = 'xyz' + values = np.array(['a', 'b', '', 'd'], dtype=object) + + +class TestICodes(TestResnames): + attrclass = tpattrs.ICodes + + class TestResids(TestResidueAttr): values = np.array([10, 11, 18, 20]) attrclass = tpattrs.Resids