Skip to content

Commit

Permalink
Merge branch 'master' into count-indels-once
Browse files Browse the repository at this point in the history
  • Loading branch information
benjaminotter authored Apr 13, 2021
2 parents 921c104 + c663c95 commit e392e09
Show file tree
Hide file tree
Showing 40 changed files with 2,973 additions and 137 deletions.
4 changes: 2 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -58,5 +58,5 @@ Thumbs.db
*sublime*
.vscode

# slurm logs
builds/flu/slurm_log/*
# Recursively ignore log files
**/*.log
20 changes: 20 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,26 @@
## __NEXT__


## 11.3.0 (19 March 2021)

### Bug Fixes

* filter: Clarify how the `--priority` input affects subsampling in the command line help documentation [#695][]
* tests: Clean up outputs created by tests [#703][], ignore log files [#701][], and remove unnecessary Conda environment file [#702][]

### Features

* io: Add new `io` module with `open_file`, `read_sequences`, and `write_sequences` functions that support compressed inputs and outputs [#652][]
* parse, index, filter, mask: Add support for compressed inputs/outputs [#652][]
* export v2: Add optional `data_provenance` field to auspice JSON output for better provenance reporting in Auspice [#705][]

[#652]: https://github.com/nextstrain/augur/pull/652
[#695]: https://github.com/nextstrain/augur/pull/695
[#701]: https://github.com/nextstrain/augur/pull/701
[#702]: https://github.com/nextstrain/augur/pull/702
[#703]: https://github.com/nextstrain/augur/pull/703
[#705]: https://github.com/nextstrain/augur/pull/705

## 11.2.0 (8 March 2021)

### Bug Fixes
Expand Down
2 changes: 1 addition & 1 deletion augur/__version__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = '11.2.0'
__version__ = '11.3.0'


def is_augur_version_compatible(version):
Expand Down
23 changes: 22 additions & 1 deletion augur/data/schema-auspice-config-v2.json
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,27 @@
"type": "object",
"description": "UNUSED v1 syntax for defining vaccine choices",
"$comment": "This is unused in `augur export v2` which gets vaccine info vis a node-data JSON file. It remains in the schema so that v1 config files can be used by `augur export v2`"
},
"data_provenance": {
"description": "Specify provenance of data included in this analysis",
"type": "array",
"minItems": 1,
"items": {
"type": "object",
"description": "An individual data source",
"additionalProperties": false,
"required": ["name"],
"properties": {
"name": {
"description": "Name of the data source",
"type": "string"
},
"url": {
"description": "URL to use in link to data source",
"type": "string"
}
}
}
}
}
}
}
21 changes: 21 additions & 0 deletions augur/data/schema-export-v2.json
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,27 @@
"$comment": "Frequencies could be specified here if desired",
"$comment": "If not specified, and frequencies are asked for in #/panels, then Auspice will attempt to fetch a seperate JSON",
"$comment": "cc John / Trevor"
},
"data_provenance": {
"description": "Specify provenance of data included in this analysis",
"type": "array",
"minItems": 1,
"items": {
"type": "object",
"description": "An individual data source",
"additionalProperties": false,
"required": ["name"],
"properties": {
"name": {
"description": "Name of the data source",
"type": "string"
},
"url": {
"description": "URL to use in link to data source",
"type": "string"
}
}
}
}
}
},
Expand Down
15 changes: 14 additions & 1 deletion augur/distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,9 +331,20 @@ def get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map,
>>> get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
3
Ignore specific characters defined in the distance map.
>>> node_a_sequences = {"gene": "ACTGG"}
>>> node_b_sequences = {"gene": "A--GN"}
>>> distance_map = {"default": 1, "ignored_characters":["-"], "map": {}}
>>> get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
1
>>> distance_map = {"default": 1, "ignored_characters":["-", "N"], "map": {}}
>>> get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
0
"""
distance_type = type(distance_map["default"])
distance = distance_type(0)
ignored_characters = distance_map.get("ignored_characters",[])

for gene in node_a_sequences:
gene_length = len(node_a_sequences[gene])
Expand All @@ -345,7 +356,9 @@ def get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map,
mismatches_by_site = defaultdict(set)
in_gap = False
for site in range(gene_length):
if node_a_sequences[gene][site] != node_b_sequences[gene][site]:
if (node_a_sequences[gene][site] != node_b_sequences[gene][site]
and node_a_sequences[gene][site] not in ignored_characters
and node_b_sequences[gene][site] not in ignored_characters):
if node_a_sequences[gene][site] == "-" or node_b_sequences[gene][site] == "-":
if not in_gap:
gap_start = site
Expand Down
22 changes: 22 additions & 0 deletions augur/export_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -433,6 +433,27 @@ def set_panels(data_json, config, cmd_line_panels):
data_json['meta']["panels"] = panels


def set_data_provenance(data_json, config):
"""Set the data provenance from the given config file to the given data JSON.
Parameters
----------
data_json : dict
auspice JSON to be updated
config : dict
config JSON with an expected ``data_provenance`` key
>>> config = {"data_provenance": [{"name": "GISAID"}, {"name": "INSDC"}]}
>>> data_json = {"meta": {}}
>>> set_data_provenance(data_json, config)
>>> data_json["meta"]["data_provenance"][0]["name"]
'GISAID'
"""
if "data_provenance" in config:
data_json["meta"]["data_provenance"] = config["data_provenance"]


def counter_to_disambiguation_suffix(count):
"""Given a numeric count of author papers, return a distinct alphabetical
disambiguation suffix.
Expand Down Expand Up @@ -914,6 +935,7 @@ def run_v2(args):
set_node_attrs_on_tree(data_json, node_attrs)
set_geo_resolutions(data_json, config, args.geo_resolutions, read_lat_longs(args.lat_longs), node_attrs)
set_panels(data_json, config, args.panels)
set_data_provenance(data_json, config)

# Write outputs - the (unified) dataset JSON intended for auspice & perhaps the ref root-sequence JSON
indent = {"indent": None} if args.minify_json else {}
Expand Down
41 changes: 26 additions & 15 deletions augur/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import treetime.utils

from .index import index_sequences
from .io import open_file, read_sequences, write_sequences
from .utils import read_metadata, read_strains, get_numerical_dates, run_shell_command, shquote, is_date_ambiguous

comment_char = '#'
Expand Down Expand Up @@ -104,8 +105,8 @@ def register_arguments(parser):
Uses Pandas Dataframe querying, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query for syntax.
(e.g., --query "country == 'Colombia'" or --query "(country == 'USA' & (division == 'Washington'))")"""
)
metadata_filter_group.add_argument('--min-date', type=numeric_date, help="minimal cutoff for date; may be specified as an Augur-style numeric date (with the year as the integer part) or YYYY-MM-DD")
metadata_filter_group.add_argument('--max-date', type=numeric_date, help="maximal cutoff for date; may be specified as an Augur-style numeric date (with the year as the integer part) or YYYY-MM-DD")
metadata_filter_group.add_argument('--min-date', type=numeric_date, help="minimal cutoff for date, the cutoff date is inclusive; may be specified as an Augur-style numeric date (with the year as the integer part) or YYYY-MM-DD")
metadata_filter_group.add_argument('--max-date', type=numeric_date, help="maximal cutoff for date, the cutoff date is inclusive; may be specified as an Augur-style numeric date (with the year as the integer part) or YYYY-MM-DD")
metadata_filter_group.add_argument('--exclude-ambiguous-dates-by', choices=['any', 'day', 'month', 'year'],
help='Exclude ambiguous dates by day (e.g., 2020-09-XX), month (e.g., 2020-XX-XX), year (e.g., 200X-10-01), or any date fields. An ambiguous year makes the corresponding month and day ambiguous, too, even if those fields have unambiguous values (e.g., "201X-10-01"). Similarly, an ambiguous month makes the corresponding day ambiguous (e.g., "2010-XX-01").')
metadata_filter_group.add_argument('--exclude', type=str, nargs="+", help="file(s) with list of strains to exclude")
Expand All @@ -124,7 +125,7 @@ def register_arguments(parser):
subsample_group.add_argument('--group-by', nargs='+', help="categories with respect to subsample; two virtual fields, \"month\" and \"year\", are supported if they don't already exist as real fields but a \"date\" field does exist")
subsample_limits_group = subsample_group.add_mutually_exclusive_group()
subsample_limits_group.add_argument('--sequences-per-group', type=int, help="subsample to no more than this number of sequences per category")
subsample_limits_group.add_argument('--subsample-max-sequences', type=int, help="subsample to no more than this number of sequences")
subsample_limits_group.add_argument('--subsample-max-sequences', type=int, help="subsample to no more than this number of sequences; can be used without the group_by argument")
probabilistic_sampling_group = subsample_group.add_mutually_exclusive_group()
probabilistic_sampling_group.add_argument('--probabilistic-sampling', action='store_true', help="Enable probabilistic sampling during subsampling. This is useful when there are more groups than requested sequences. This option only applies when `--subsample-max-sequences` is provided.")
probabilistic_sampling_group.add_argument('--no-probabilistic-sampling', action='store_false', dest='probabilistic_sampling')
Expand Down Expand Up @@ -347,9 +348,9 @@ def run(args):
dates = get_numerical_dates(meta_dict, fmt="%Y-%m-%d")
tmp = {s for s in seq_keep if dates[s] is not None}
if args.min_date:
tmp = {s for s in tmp if (np.isscalar(dates[s]) or all(dates[s])) and np.max(dates[s])>args.min_date}
tmp = {s for s in tmp if (np.isscalar(dates[s]) or all(dates[s])) and np.max(dates[s])>=args.min_date}
if args.max_date:
tmp = {s for s in tmp if (np.isscalar(dates[s]) or all(dates[s])) and np.min(dates[s])<args.max_date}
tmp = {s for s in tmp if (np.isscalar(dates[s]) or all(dates[s])) and np.min(dates[s])<=args.max_date}
num_excluded_by_date = len(seq_keep) - len(tmp)
seq_keep = tmp

Expand All @@ -375,16 +376,26 @@ def run(args):
if args.subsample_seed:
random.seed(args.subsample_seed)
num_excluded_subsamp = 0
if args.group_by and (args.sequences_per_group or args.subsample_max_sequences):
if args.subsample_max_sequences or (args.group_by and args.sequences_per_group):

#set groups to group_by values
if args.group_by:
groups = args.group_by
#if group_by not specified use dummy category
else:
groups = ["_dummy"]

spg = args.sequences_per_group
seq_names_by_group = defaultdict(list)

for seq_name in seq_keep:
group = []
m = meta_dict[seq_name]
# collect group specifiers
for c in args.group_by:
if c in m:
for c in groups:
if c == "_dummy":
group.append(c)
elif c in m:
group.append(m[c])
elif c in ['month', 'year'] and 'date' in m:
try:
Expand All @@ -406,16 +417,16 @@ def run(args):

#If didnt find any categories specified, all seqs will be in 'unknown' - but don't sample this!
if len(seq_names_by_group)==1 and ('unknown' in seq_names_by_group or ('unknown',) in seq_names_by_group):
print("WARNING: The specified group-by categories (%s) were not found."%args.group_by,
print("WARNING: The specified group-by categories (%s) were not found."%groups,
"No sequences-per-group sampling will be done.")
if any([x in args.group_by for x in ['year','month']]):
if any([x in groups for x in ['year','month']]):
print("Note that using 'year' or 'year month' requires a column called 'date'.")
print("\n")
else:
# Check to see if some categories are missing to warn the user
group_by = set(['date' if cat in ['year','month'] else cat
for cat in args.group_by])
missing_cats = [cat for cat in group_by if cat not in meta_columns]
for cat in groups])
missing_cats = [cat for cat in group_by if cat not in meta_columns and cat != "_dummy"]
if missing_cats:
print("WARNING:")
if any([cat != 'date' for cat in missing_cats]):
Expand Down Expand Up @@ -548,19 +559,19 @@ def run(args):
dropped_samps = list(available_strains - seq_keep)
write_vcf(args.sequences, args.output, dropped_samps)
elif args.sequences and args.output:
sequences = SeqIO.parse(args.sequences, "fasta")
sequences = read_sequences(args.sequences)

# Stream to disk all sequences that passed all filters to avoid reading
# sequences into memory first. Track the observed strain names in the
# sequence file as part of the single pass to allow comparison with the
# provided sequence index.
observed_sequence_strains = set()
with open(args.output, "w") as output_handle:
with open_file(args.output, "wt") as output_handle:
for sequence in sequences:
observed_sequence_strains.add(sequence.id)

if sequence.id in seq_keep:
SeqIO.write(sequence, output_handle, 'fasta')
write_sequences(sequence, output_handle, 'fasta')

if sequence_strains != observed_sequence_strains:
# Warn the user if the expected strains from the sequence index are
Expand Down
21 changes: 12 additions & 9 deletions augur/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,15 @@
import sys
import csv

from .io import open_file, read_sequences


def register_arguments(parser):
parser.add_argument('--sequences', '-s', required=True, help="sequences in fasta format")
parser.add_argument('--output', '-o', help="tab-delimited file containing the number of bases per sequence in the given file. Output columns include strain, length, and counts for A, C, G, T, N, other valid IUPAC characters, ambiguous characters ('?' and '-'), and other invalid characters.", required=True)
parser.add_argument('--verbose', '-v', action="store_true", help="print index statistics to stdout")


def index_sequence(sequence, values):
"""Count the number of nucleotides for a given sequence record.
Expand Down Expand Up @@ -127,13 +131,7 @@ def index_sequences(sequences_path, sequence_index_path):
total length of sequences indexed
"""
#read in files
try:
seqs = SeqIO.parse(sequences_path, 'fasta')
except ValueError as error:
print("ERROR: Problem reading in {}:".format(sequences_path), file=sys.stderr)
print(error, file=sys.stderr)
return 1
seqs = read_sequences(sequences_path)

other_IUPAC = {'r', 'y', 's', 'w', 'k', 'm', 'd', 'h', 'b', 'v'}
values = [{'a'},{'c'},{'g'},{'t'},{'n'},other_IUPAC,{'-'},{'?'}]
Expand All @@ -142,7 +140,7 @@ def index_sequences(sequences_path, sequence_index_path):
tot_length = 0
num_of_seqs = 0

with open(sequence_index_path, 'wt') as out_file:
with open_file(sequence_index_path, 'wt') as out_file:
tsv_writer = csv.writer(out_file, delimiter = '\t')

#write header i output file
Expand All @@ -166,7 +164,12 @@ def run(args):
("?" and "-"), and other invalid characters in a set of sequences and write
the composition as a data frame to the given sequence index path.
'''
num_of_seqs, tot_length = index_sequences(args.sequences, args.output)
try:
num_of_seqs, tot_length = index_sequences(args.sequences, args.output)
except ValueError as error:
print("ERROR: Problem reading in {}:".format(sequences_path), file=sys.stderr)
print(error, file=sys.stderr)
return 1

if args.verbose:
print("Analysed %i sequences with an average length of %i nucleotides." % (num_of_seqs, int(tot_length / num_of_seqs)))
Loading

0 comments on commit e392e09

Please sign in to comment.