Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Faster name selections #2755

Merged
merged 18 commits into from
Aug 25, 2020
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions package/CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ Fixes
* In hydrogenbonds.hbond_analysis.HydrogenbondAnalysis an AttributeError
was thrown when finding D-H pairs via the topology if `hydrogens` was an
empty AtomGroup (Issue #2848)
* Fixed performance regression on select_atoms for string selections (Issue #2751)
* Fixed the DMSParser, allowing the creation of multiple segids sharing
residues with identical resids (Issue #1387, PR #2872)

Expand All @@ -60,6 +61,8 @@ Enhancements
* Added Hydrogen Bond Lifetime keyword "between" (PR #2791)
* Dead code removed from the TPR parser and increased test coverage (PR #2840)
* TPR parser exposes the elements topology attribute (PR #2858, see Issue #2553)
* Improved performance of select_atoms on strings (e.g. name, type, resname) and
richardjgowers marked this conversation as resolved.
Show resolved Hide resolved
'protein' selection (Issue #2751, PR #2755)

Changes
* deprecated NumPy type aliases have been replaced with their actual types
Expand Down
168 changes: 136 additions & 32 deletions package/MDAnalysis/core/selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -515,7 +515,7 @@ def apply(self, group):
return group[mask]


class StringSelection(Selection):
class _ProtoStringSelection(Selection):
"""Selections based on text attributes

.. versionchanged:: 1.0.0
Expand All @@ -530,11 +530,23 @@ def __init__(self, parser, tokens):

@return_empty_on_apply
def apply(self, group):
mask = np.zeros(len(group), dtype=bool)
for val in self.values:
values = getattr(group, self.field)
mask |= [fnmatch.fnmatch(x, val) for x in values]
return group[mask].unique
# rather than work on group.names, cheat and look at the lookup table
nmattr = getattr(group.universe._topology, self.field)

matches = [] # list of passing indices
richardjgowers marked this conversation as resolved.
Show resolved Hide resolved
# iterate through set of known atom names, check which pass
for nm, ix in nmattr.namedict.items():
if any(fnmatch.fnmatchcase(nm, val) for val in self.values):
matches.append(ix)

# atomname indices for members of this group
nmidx = nmattr.nmidx[getattr(group, self.level)]

return group[np.in1d(nmidx, matches)].unique


class StringSelection(_ProtoStringSelection):
level = 'ix' # operates on atom level attribute, i.e. '.ix'


class AtomNameSelection(StringSelection):
Expand All @@ -561,22 +573,27 @@ class AtomICodeSelection(StringSelection):
field = 'icodes'


class ResidueNameSelection(StringSelection):
class _ResidueStringSelection(_ProtoStringSelection):
level= 'resindices'


class ResidueNameSelection(_ResidueStringSelection):
"""Select atoms based on 'resnames' attribute"""
token = 'resname'
field = 'resnames'


class MoleculeTypeSelection(StringSelection):
class MoleculeTypeSelection(_ResidueStringSelection):
"""Select atoms based on 'moltypes' attribute"""
token = 'moltype'
field = 'moltypes'


class SegmentNameSelection(StringSelection):
class SegmentNameSelection(_ProtoStringSelection):
"""Select atoms based on 'segids' attribute"""
token = 'segid'
field = 'segids'
level = 'segindices'


class AltlocSelection(StringSelection):
Expand Down Expand Up @@ -802,10 +819,14 @@ class ProteinSelection(Selection):
See Also
--------
:func:`MDAnalysis.lib.util.convert_aa_code`

.. versionchanged:: 2.0.0
orbeckst marked this conversation as resolved.
Show resolved Hide resolved
prot_res changed to set (from numpy array)
performance improved by ~100x on larger systems
"""
token = 'protein'

prot_res = np.array([
prot_res = {
# CHARMM top_all27_prot_lipid.rtf
'ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY', 'HSD',
'HSE', 'HSP', 'ILE', 'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER', 'THR',
Expand All @@ -828,14 +849,20 @@ class ProteinSelection(Selection):
'CLEU', 'CILE', 'CVAL', 'CASF', 'CASN', 'CGLN', 'CARG', 'CHID', 'CHIE',
'CHIP', 'CTRP', 'CPHE', 'CTYR', 'CGLU', 'CASP', 'CLYS', 'CPRO', 'CCYS',
'CCYX', 'CMET', 'CME', 'ASF',
])
}

def __init__(self, parser, tokens):
pass

def apply(self, group):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

select_atoms('protein') went from 50ms to 0.5ms on GRO

mask = np.in1d(group.resnames, self.prot_res)
return group[mask].unique
resname_attr = group.universe._topology.resnames
# which values in resname attr are in prot_res?
matches = [ix for (nm, ix) in resname_attr.namedict.items()
if nm in self.prot_res]
# index of each atom's resname
nmidx = resname_attr.nmidx[group.resindices]
# intersect atom's resname index and matches to prot_res
return group[np.in1d(nmidx, matches)].unique


class NucleicSelection(Selection):
Expand All @@ -850,23 +877,32 @@ class NucleicSelection(Selection):

.. versionchanged:: 0.8
additional Gromacs selections
.. versionchanged:: 2.0.0
nucl_res changed to set (from numpy array)
performance improved by ~100x on larger systems
"""
token = 'nucleic'

nucl_res = np.array([
nucl_res = {
'ADE', 'URA', 'CYT', 'GUA', 'THY', 'DA', 'DC', 'DG', 'DT', 'RA',
'RU', 'RG', 'RC', 'A', 'T', 'U', 'C', 'G',
'DA5', 'DC5', 'DG5', 'DT5',
'DA3', 'DC3', 'DG3', 'DT3',
'RA5', 'RU5', 'RG5', 'RC5',
'RA3', 'RU3', 'RG3', 'RC3'
])
}

def __init__(self, parser, tokens):
pass

def apply(self, group):
mask = np.in1d(group.resnames, self.nucl_res)
resnames = group.universe._topology.resnames
nmidx = resnames.nmidx[group.resindices]

matches = [ix for (nm, ix) in resnames.namedict.items()
if nm in self.nucl_res]
mask = np.in1d(nmidx, matches)

return group[mask].unique


Expand All @@ -875,29 +911,63 @@ class BackboneSelection(ProteinSelection):

This excludes OT* on C-termini
(which are included by, eg VMD's backbone selection).

orbeckst marked this conversation as resolved.
Show resolved Hide resolved
.. versionchanged:: 2.0.0
bb_atoms changed to set (from numpy array)
performance improved by ~100x on larger systems
"""
token = 'backbone'
bb_atoms = np.array(['N', 'CA', 'C', 'O'])
bb_atoms = {'N', 'CA', 'C', 'O'}

def apply(self, group):
mask = np.in1d(group.names, self.bb_atoms)
mask &= np.in1d(group.resnames, self.prot_res)
return group[mask].unique
atomnames = group.universe._topology.names
resnames = group.universe._topology.resnames

# filter by atom names
name_matches = [ix for (nm, ix) in atomnames.namedict.items()
if nm in self.bb_atoms]
nmidx = atomnames.nmidx[group.ix]
group = group[np.in1d(nmidx, name_matches)]

# filter by resnames
resname_matches = [ix for (nm, ix) in resnames.namedict.items()
if nm in self.prot_res]
nmidx = resnames.nmidx[group.resindices]
group = group[np.in1d(nmidx, resname_matches)]

return group.unique


class NucleicBackboneSelection(NucleicSelection):
"""Contains all atoms with name "P", "C5'", "C3'", "O3'", "O5'".

These atoms are only recognized if they are in a residue matched
by the :class:`NucleicSelection`.

orbeckst marked this conversation as resolved.
Show resolved Hide resolved
.. versionchanged:: 2.0.0
bb_atoms changed to set (from numpy array)
performance improved by ~100x on larger systems
"""
token = 'nucleicbackbone'
bb_atoms = np.array(["P", "C5'", "C3'", "O3'", "O5'"])
bb_atoms = {"P", "C5'", "C3'", "O3'", "O5'"}

def apply(self, group):
mask = np.in1d(group.names, self.bb_atoms)
mask &= np.in1d(group.resnames, self.nucl_res)
return group[mask].unique
atomnames = group.universe._topology.names
resnames = group.universe._topology.resnames

# filter by atom names
name_matches = [ix for (nm, ix) in atomnames.namedict.items()
if nm in self.bb_atoms]
nmidx = atomnames.nmidx[group.ix]
group = group[np.in1d(nmidx, name_matches)]

# filter by resnames
resname_matches = [ix for (nm, ix) in resnames.namedict.items()
if nm in self.nucl_res]
nmidx = resnames.nmidx[group.resindices]
group = group[np.in1d(nmidx, resname_matches)]

return group.unique


class BaseSelection(NucleicSelection):
Expand All @@ -907,29 +977,63 @@ class BaseSelection(NucleicSelection):

'N9', 'N7', 'C8', 'C5', 'C4', 'N3', 'C2', 'N1', 'C6',
'O6','N2','N6', 'O2','N4','O4','C5M'

orbeckst marked this conversation as resolved.
Show resolved Hide resolved
.. versionchanged:: 2.0.0
base_atoms changed to set (from numpy array)
performance improved by ~100x on larger systems
"""
token = 'nucleicbase'
base_atoms = np.array([
base_atoms = {
'N9', 'N7', 'C8', 'C5', 'C4', 'N3', 'C2', 'N1', 'C6',
'O6', 'N2', 'N6',
'O2', 'N4', 'O4', 'C5M'])
'O2', 'N4', 'O4', 'C5M'}

def apply(self, group):
mask = np.in1d(group.names, self.base_atoms)
mask &= np.in1d(group.resnames, self.nucl_res)
return group[mask].unique
atomnames = group.universe._topology.names
resnames = group.universe._topology.resnames

# filter by atom names
name_matches = [ix for (nm, ix) in atomnames.namedict.items()
if nm in self.base_atoms]
nmidx = atomnames.nmidx[group.ix]
group = group[np.in1d(nmidx, name_matches)]

# filter by resnames
resname_matches = [ix for (nm, ix) in resnames.namedict.items()
if nm in self.nucl_res]
nmidx = resnames.nmidx[group.resindices]
group = group[np.in1d(nmidx, resname_matches)]

return group.unique


class NucleicSugarSelection(NucleicSelection):
"""Contains all atoms with name C1', C2', C3', C4', O4'.

orbeckst marked this conversation as resolved.
Show resolved Hide resolved
.. versionchanged:: 2.0.0
sug_atoms changed to set (from numpy array)
performance improved by ~100x on larger systems
"""
token = 'nucleicsugar'
sug_atoms = np.array(["C1'", "C2'", "C3'", "C4'", "O4'"])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't this be a set?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's what the versionchanged says....

orbeckst marked this conversation as resolved.
Show resolved Hide resolved

def apply(self, group):
mask = np.in1d(group.names, self.sug_atoms)
mask &= np.in1d(group.resnames, self.nucl_res)
return group[mask].unique
atomnames = group.universe._topology.names
resnames = group.universe._topology.resnames

# filter by atom names
name_matches = [ix for (nm, ix) in atomnames.namedict.items()
if nm in self.sug_atoms]
nmidx = atomnames.nmidx[group.ix]
group = group[np.in1d(nmidx, name_matches)]

# filter by resnames
resname_matches = [ix for (nm, ix) in resnames.namedict.items()
if nm in self.nucl_res]
nmidx = resnames.nmidx[group.resindices]
group = group[np.in1d(nmidx, resname_matches)]

return group.unique


class PropertySelection(Selection):
Expand Down
Loading