Skip to content

Commit

Permalink
Merge pull request #710 from nextstrain/subsample-without-group-by
Browse files Browse the repository at this point in the history
Allow `--subsample-max-sequences` without `--group-by`
  • Loading branch information
huddlej authored Apr 9, 2021
2 parents 1815565 + bb0e824 commit 6e5c9f7
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 8 deletions.
26 changes: 18 additions & 8 deletions augur/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ def register_arguments(parser):
subsample_group.add_argument('--group-by', nargs='+', help="categories with respect to subsample; two virtual fields, \"month\" and \"year\", are supported if they don't already exist as real fields but a \"date\" field does exist")
subsample_limits_group = subsample_group.add_mutually_exclusive_group()
subsample_limits_group.add_argument('--sequences-per-group', type=int, help="subsample to no more than this number of sequences per category")
subsample_limits_group.add_argument('--subsample-max-sequences', type=int, help="subsample to no more than this number of sequences")
subsample_limits_group.add_argument('--subsample-max-sequences', type=int, help="subsample to no more than this number of sequences; can be used without the group_by argument")
probabilistic_sampling_group = subsample_group.add_mutually_exclusive_group()
probabilistic_sampling_group.add_argument('--probabilistic-sampling', action='store_true', help="Enable probabilistic sampling during subsampling. This is useful when there are more groups than requested sequences. This option only applies when `--subsample-max-sequences` is provided.")
probabilistic_sampling_group.add_argument('--no-probabilistic-sampling', action='store_false', dest='probabilistic_sampling')
Expand Down Expand Up @@ -376,16 +376,26 @@ def run(args):
if args.subsample_seed:
random.seed(args.subsample_seed)
num_excluded_subsamp = 0
if args.group_by and (args.sequences_per_group or args.subsample_max_sequences):
if args.subsample_max_sequences or (args.group_by and args.sequences_per_group):

#set groups to group_by values
if args.group_by:
groups = args.group_by
#if group_by not specified use dummy category
else:
groups = ["_dummy"]

spg = args.sequences_per_group
seq_names_by_group = defaultdict(list)

for seq_name in seq_keep:
group = []
m = meta_dict[seq_name]
# collect group specifiers
for c in args.group_by:
if c in m:
for c in groups:
if c == "_dummy":
group.append(c)
elif c in m:
group.append(m[c])
elif c in ['month', 'year'] and 'date' in m:
try:
Expand All @@ -407,16 +417,16 @@ def run(args):

#If we didn't find any of the specified categories, all seqs will be in 'unknown' - but don't sample this!
if len(seq_names_by_group)==1 and ('unknown' in seq_names_by_group or ('unknown',) in seq_names_by_group):
print("WARNING: The specified group-by categories (%s) were not found."%args.group_by,
print("WARNING: The specified group-by categories (%s) were not found."%groups,
"No sequences-per-group sampling will be done.")
if any([x in args.group_by for x in ['year','month']]):
if any([x in groups for x in ['year','month']]):
print("Note that using 'year' or 'year month' requires a column called 'date'.")
print("\n")
else:
# Check to see if some categories are missing to warn the user
group_by = set(['date' if cat in ['year','month'] else cat
for cat in args.group_by])
missing_cats = [cat for cat in group_by if cat not in meta_columns]
for cat in groups])
missing_cats = [cat for cat in group_by if cat not in meta_columns and cat != "_dummy"]
if missing_cats:
print("WARNING:")
if any([cat != 'date' for cat in missing_cats]):
Expand Down
16 changes: 16 additions & 0 deletions tests/functional/filter.t
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,22 @@ With 10 groups to subsample from, this should produce one sequence per group.
\s*10 (re)
$ rm -f "$TMP/filtered.fasta"

Filter with subsampling where no more than 5 sequences are requested and no groups are specified.
This places all sequences in a single dummy category and subsamples from it. With `--no-probabilistic-sampling`, we expect exactly 5 sequences.

$ ${AUGUR} filter \
> --sequences filter/sequences.fasta \
> --sequence-index filter/sequence_index.tsv \
> --metadata filter/metadata.tsv \
> --min-date 2012 \
> --subsample-max-sequences 5 \
> --subsample-seed 314159 \
> --no-probabilistic-sampling \
> --output "$TMP/filtered.fasta" > /dev/null
$ grep ">" "$TMP/filtered.fasta" | wc -l
\s*5 (re)
$ rm -f "$TMP/filtered.fasta"

Try to filter with subsampling when there are more available groups than requested sequences.
This should fail, as probabilistic sampling is explicitly disabled.

Expand Down

0 comments on commit 6e5c9f7

Please sign in to comment.