Skip to content

Commit

Permalink
More flexibility with wildcards in selection (#2551)
Browse files Browse the repository at this point in the history
* Fixes #2436 
* Selection strings changed to use fnmatch. This now allows for more flexible wildcard usage as well as for using multiple wildcards at once.
* Added two new tests to match the new functionality
* New doc section on wildcards
* Remove test for multiple wildcards
* Update in AUTHORS
* Update CHANGELOG
  • Loading branch information
Iv-Hristov authored Mar 4, 2020
1 parent bc2f1a5 commit eb18a33
Show file tree
Hide file tree
Showing 5 changed files with 43 additions and 33 deletions.
1 change: 1 addition & 0 deletions package/AUTHORS
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ Chronological list of authors
2020
- Charlie Cook
- Yuanyu Chang
- Ivan Hristov
- Michael Quevillon
- Hao Tian

Expand Down
3 changes: 2 additions & 1 deletion package/CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ The rules for this file:
------------------------------------------------------------------------------
mm/dd/yy richardjgowers, kain88-de, lilyminium, p-j-smith, bdice, joaomcteixeira,
PicoCentauri, davidercruz, jbarnoud, RMeli, IAlibay, mtiberti, CCook96,
Yuan-Yu, xiki-tempula, HTian1997
Yuan-Yu, xiki-tempula, HTian1997, Iv-Hristov

* 0.21.0

Expand Down Expand Up @@ -57,6 +57,7 @@ Fixes
* Added parmed to setup.py

Enhancements
* Selection strings now use fnmatch-style pattern matching, which allows multiple wildcards in one string (#2436)
* Added coordinate reader and writer for NAMD binary coordinate format (PR #2485)
* Improved ClusterCollection and Cluster string representations (Issue #2464)
* XYZ parser store elements attribute (#2420) and XYZ write uses the elements
Expand Down
17 changes: 5 additions & 12 deletions package/MDAnalysis/core/selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@

import collections
import re
import fnmatch
import functools
import warnings

Expand Down Expand Up @@ -499,8 +500,8 @@ def apply(self, group):
class StringSelection(Selection):
"""Selections based on text attributes
Supports the use of one wildcard at the start,
end, and middle of strings
.. versionchanged:: 1.0.0
Supports multiple wildcards, based on fnmatch
"""
def __init__(self, parser, tokens):
vals = grab_not_keywords(tokens)
Expand All @@ -512,16 +513,8 @@ def __init__(self, parser, tokens):
def apply(self, group):
    """Select the members of *group* whose text attribute matches a pattern.

    Every entry of ``self.values`` is treated as an :mod:`fnmatch`-style
    pattern (``*``, ``?``, ``[seq]``, ``[!seq]``) and compared against the
    ``self.field`` attribute of each member of *group*.  A member is kept
    when it matches at least one of the patterns.

    Matching is always case-sensitive: :func:`fnmatch.fnmatchcase` is used
    because :func:`fnmatch.fnmatch` case-normalises both arguments on some
    platforms (e.g. Windows), which would make the same selection string
    return different atoms depending on the operating system.

    Parameters
    ----------
    group
        Object that exposes the ``self.field`` attribute as a sequence of
        strings and supports ``len()`` and boolean-mask indexing.

    Returns
    -------
    The ``unique`` attribute of the masked group.
    """
    # The attribute does not depend on the pattern, so fetch it only once.
    values = getattr(group, self.field)
    # Builtin ``bool`` instead of the deprecated ``np.bool`` alias.
    mask = np.zeros(len(group), dtype=bool)
    for val in self.values:
        # An explicit bool dtype keeps the in-place OR valid for empty
        # groups, where a bare empty list would be coerced to float64.
        mask |= np.array(
            [fnmatch.fnmatchcase(x, val) for x in values], dtype=bool
        )
    return group[mask].unique


Expand Down
26 changes: 21 additions & 5 deletions package/doc/sphinx/source/documentation_pages/selections.rst
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,6 @@ selection parser. The following applies to all selections:
necessary).
* Selections are parsed left to right and parentheses can be used for
grouping.
* Currently, a single wildcard ``*`` character can be at the start, middle, or
end of a string for pattern matching. For example, ``GL*`` selects
all strings that start with "GL" such as "GLU", "GLY", "GLX29", "GLN".
``resname *N`` selects all residue names that end in "N", such as "ASN" and
"GLN".


Simple selections
Expand Down Expand Up @@ -104,6 +99,27 @@ moltype *molecule-type*
select by molecule type, e.g. ``moltype Protein_A``. At the moment, only
the TPR format defines the molecule type.

Pattern matching
----------------

The pattern matching notation described below is used to specify
patterns for matching strings (based on :mod:`fnmatch`):

``?``
Is a pattern that will match any single character. For example,
``resname T?R`` selects residues named "TYR" and "THR".
``*``
Is a pattern that will match any number of characters, including none. For
example, ``resname GL*`` selects all residues whose name starts with "GL",
such as "GLU", "GLY", "GLX29", "GLN".
``[seq]``
Matches any single character in *seq*. For example, ``resname GL[NY]``
selects all residues named "GLN" or "GLY" but would not select
"GLU".
``[!seq]``
Matches any single character not in *seq*. For example, ``resname GL[!NY]``
would match residues named "GLU" but would not match "GLN" or "GLY".

Boolean
-------

Expand Down
29 changes: 14 additions & 15 deletions testsuite/MDAnalysisTests/core/test_atomselections.py
Original file line number Diff line number Diff line change
Expand Up @@ -364,22 +364,22 @@ def test_global(self, universe):
ag2 = ag.select_atoms("around 4 global backbone")
assert_equal(ag2.indices, ag1.indices)

def test_wildcard_middle_selection(self, universe):
ag = universe.select_atoms('resname TYR or resname THR')
ag_wild = universe.select_atoms('resname T*R')
assert ag == ag_wild

def test_wildcard_start_selection(self, universe):
ag = universe.select_atoms('resname ASN GLN')
ag_wild = universe.select_atoms('resname *N')
assert ag == ag_wild

def test_wildcard_terminal_selection(self, universe):
ag = universe.select_atoms('resname ASN ASP')
ag_wild = universe.select_atoms('resname AS*')
# Each case pairs an explicit resname selection with a wildcard pattern that
# is expected to select exactly the same atoms.
@pytest.mark.parametrize('selstring, wildstring', [
('resname TYR THR', 'resname T*R'),  # '*' in the middle
('resname ASN GLN', 'resname *N'),  # '*' at the start
('resname ASN ASP', 'resname AS*'),  # '*' at the end
('resname TYR THR', 'resname T?R'),  # '?' stands for one character
('resname ASN ASP HSD', 'resname *S?'),  # '*' and '?' combined
('resname LEU LYS', 'resname L**'),  # two consecutive '*'
('resname MET', 'resname *M*'),  # '*' on both sides
('resname GLN GLY', 'resname GL[NY]'),  # '[seq]' character set
('resname GLU', 'resname GL[!NY]'),  # '[!seq]' negated character set
])
def test_wildcard_selection(self, universe, selstring, wildstring):
# Reference selection built from the explicit residue names.
ag = universe.select_atoms(selstring)
# Same selection expressed through pattern matching.
ag_wild = universe.select_atoms(wildstring)
assert ag == ag_wild


class TestSelectionsAMBER(object):
@pytest.fixture()
def universe(self):
Expand Down Expand Up @@ -927,7 +927,6 @@ def universe():
'index or protein',
'prop mass < 4.0 hello', # unused token
'prop mass > 10. and group this', # missing group
'resname E*Y*Z', # >1 wildcards
])
def test_selection_fail(self, selstr, universe):
with pytest.raises(SelectionError):
Expand Down

0 comments on commit eb18a33

Please sign in to comment.