Merge branch 'master' into count-indels-once

nextstrain · Apr 13, 2021 · e392e09 · e392e09
2 parents 921c104 + c663c95
commit e392e09
Show file tree

Hide file tree

Showing 40 changed files with 2,973 additions and 137 deletions.
diff --git a/.gitignore b/.gitignore
@@ -58,5 +58,5 @@ Thumbs.db
 *sublime*
 .vscode
 
-# slurm logs
-builds/flu/slurm_log/*
+# Recursively ignore log files
+**/*.log
diff --git a/CHANGES.md b/CHANGES.md
@@ -3,6 +3,26 @@
 ## __NEXT__
 
 
+## 11.3.0 (19 March 2021)
+
+### Bug Fixes
+
+* filter: Clarify how the `--priority` input affects subsampling in the command line help documentation [#695][]
+* tests: Clean up outputs created by tests [#703][], ignore log files [#701][], and remove unnecessary Conda environment file [#702][]
+
+### Features
+
+* io: Add new `io` module with `open_file`, `read_sequences`, and `write_sequences` functions that support compressed inputs and outputs [#652][]
+* parse, index, filter, mask: Add support for compressed inputs/outputs [#652][]
+* export v2: Add optional `data_provenance` field to auspice JSON output for better provenance reporting in Auspice [#705][]
+
+[#652]: https://github.com/nextstrain/augur/pull/652
+[#695]: https://github.com/nextstrain/augur/pull/695
+[#701]: https://github.com/nextstrain/augur/pull/701
+[#702]: https://github.com/nextstrain/augur/pull/702
+[#703]: https://github.com/nextstrain/augur/pull/703
+[#705]: https://github.com/nextstrain/augur/pull/705
+
 ## 11.2.0 (8 March 2021)
 
 ### Bug Fixes

diff --git a/augur/__version__.py b/augur/__version__.py
@@ -1,4 +1,4 @@
-__version__ = '11.2.0'
+__version__ = '11.3.0'
 
 
 def is_augur_version_compatible(version):

diff --git a/augur/data/schema-auspice-config-v2.json b/augur/data/schema-auspice-config-v2.json
@@ -167,6 +167,27 @@
             "type": "object",
             "description": "UNUSED v1 syntax for defining vaccine choices",
             "$comment": "This is unused in `augur export v2` which gets vaccine info vis a node-data JSON file. It remains in the schema so that v1 config files can be used by `augur export v2`"
+        },
+        "data_provenance": {
+            "description": "Specify provenance of data included in this analysis",
+            "type": "array",
+            "minItems": 1,
+            "items": {
+                "type": "object",
+                "description": "An individual data source",
+                "additionalProperties": false,
+                "required": ["name"],
+                "properties": {
+                    "name": {
+                        "description": "Name of the data source",
+                        "type": "string"
+                    },
+                    "url": {
+                        "description": "URL to use in link to data source",
+                        "type": "string"
+                    }
+                }
+            }
         }
     }
-}
+}
diff --git a/augur/data/schema-export-v2.json b/augur/data/schema-export-v2.json
@@ -243,6 +243,27 @@
                     "$comment": "Frequencies could be specified here if desired",
                     "$comment": "If not specified, and frequencies are asked for in #/panels, then Auspice will attempt to fetch a seperate JSON",
                     "$comment": "cc John / Trevor"
+                },
+                "data_provenance": {
+                    "description": "Specify provenance of data included in this analysis",
+                    "type": "array",
+                    "minItems": 1,
+                    "items": {
+                        "type": "object",
+                        "description": "An individual data source",
+                        "additionalProperties": false,
+                        "required": ["name"],
+                        "properties": {
+                            "name": {
+                                "description": "Name of the data source",
+                                "type": "string"
+                            },
+                            "url": {
+                                "description": "URL to use in link to data source",
+                                "type": "string"
+                            }
+                        }
+                    }
                 }
             }
         },

diff --git a/augur/distance.py b/augur/distance.py
@@ -331,9 +331,20 @@ def get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map,
     >>> get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
     3
 
+    Ignore specific characters defined in the distance map.
+
+    >>> node_a_sequences = {"gene": "ACTGG"}
+    >>> node_b_sequences = {"gene": "A--GN"}
+    >>> distance_map = {"default": 1, "ignored_characters":["-"], "map": {}}
+    >>> get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
+    1
+    >>> distance_map = {"default": 1, "ignored_characters":["-", "N"], "map": {}}
+    >>> get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
+    0
     """
     distance_type = type(distance_map["default"])
     distance = distance_type(0)
+    ignored_characters = distance_map.get("ignored_characters",[])
 
     for gene in node_a_sequences:
         gene_length = len(node_a_sequences[gene])
@@ -345,7 +356,9 @@ def get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map,
         mismatches_by_site = defaultdict(set)
         in_gap = False
         for site in range(gene_length):
-            if node_a_sequences[gene][site] != node_b_sequences[gene][site]:
+            if (node_a_sequences[gene][site] != node_b_sequences[gene][site]
+                and node_a_sequences[gene][site] not in ignored_characters
+                and node_b_sequences[gene][site] not in ignored_characters):
                 if node_a_sequences[gene][site] == "-" or node_b_sequences[gene][site] == "-":
                     if not in_gap:
                         gap_start = site

diff --git a/augur/export_v2.py b/augur/export_v2.py
@@ -433,6 +433,27 @@ def set_panels(data_json, config, cmd_line_panels):
     data_json['meta']["panels"] = panels
 
 
+def set_data_provenance(data_json, config):
+    """Set the data provenance from the given config file to the given data JSON.
+
+    Parameters
+    ----------
+    data_json : dict
+        auspice JSON to be updated
+    config : dict
+        config JSON with an expected ``data_provenance`` key
+
+    >>> config = {"data_provenance": [{"name": "GISAID"}, {"name": "INSDC"}]}
+    >>> data_json = {"meta": {}}
+    >>> set_data_provenance(data_json, config)
+    >>> data_json["meta"]["data_provenance"][0]["name"]
+    'GISAID'
+
+    """
+    if "data_provenance" in config:
+        data_json["meta"]["data_provenance"] = config["data_provenance"]
+
+
 def counter_to_disambiguation_suffix(count):
     """Given a numeric count of author papers, return a distinct alphabetical
     disambiguation suffix.
@@ -914,6 +935,7 @@ def run_v2(args):
     set_node_attrs_on_tree(data_json, node_attrs)
     set_geo_resolutions(data_json, config, args.geo_resolutions, read_lat_longs(args.lat_longs), node_attrs)
     set_panels(data_json, config, args.panels)
+    set_data_provenance(data_json, config)
 
     # Write outputs - the (unified) dataset JSON intended for auspice & perhaps the ref root-sequence JSON
     indent = {"indent": None} if args.minify_json else {}

diff --git a/augur/filter.py b/augur/filter.py
@@ -14,6 +14,7 @@
 import treetime.utils
 
 from .index import index_sequences
+from .io import open_file, read_sequences, write_sequences
 from .utils import read_metadata, read_strains, get_numerical_dates, run_shell_command, shquote, is_date_ambiguous
 
 comment_char = '#'
@@ -104,8 +105,8 @@ def register_arguments(parser):
         Uses Pandas Dataframe querying, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query for syntax.
         (e.g., --query "country == 'Colombia'" or --query "(country == 'USA' & (division == 'Washington'))")"""
     )
-    metadata_filter_group.add_argument('--min-date', type=numeric_date, help="minimal cutoff for date; may be specified as an Augur-style numeric date (with the year as the integer part) or YYYY-MM-DD")
-    metadata_filter_group.add_argument('--max-date', type=numeric_date, help="maximal cutoff for date; may be specified as an Augur-style numeric date (with the year as the integer part) or YYYY-MM-DD")
+    metadata_filter_group.add_argument('--min-date', type=numeric_date, help="minimal cutoff for date, the cutoff date is inclusive; may be specified as an Augur-style numeric date (with the year as the integer part) or YYYY-MM-DD")
+    metadata_filter_group.add_argument('--max-date', type=numeric_date, help="maximal cutoff for date, the cutoff date is inclusive; may be specified as an Augur-style numeric date (with the year as the integer part) or YYYY-MM-DD")
     metadata_filter_group.add_argument('--exclude-ambiguous-dates-by', choices=['any', 'day', 'month', 'year'],
                                 help='Exclude ambiguous dates by day (e.g., 2020-09-XX), month (e.g., 2020-XX-XX), year (e.g., 200X-10-01), or any date fields. An ambiguous year makes the corresponding month and day ambiguous, too, even if those fields have unambiguous values (e.g., "201X-10-01"). Similarly, an ambiguous month makes the corresponding day ambiguous (e.g., "2010-XX-01").')
     metadata_filter_group.add_argument('--exclude', type=str, nargs="+", help="file(s) with list of strains to exclude")
@@ -124,7 +125,7 @@ def register_arguments(parser):
     subsample_group.add_argument('--group-by', nargs='+', help="categories with respect to subsample; two virtual fields, \"month\" and \"year\", are supported if they don't already exist as real fields but a \"date\" field does exist")
     subsample_limits_group = subsample_group.add_mutually_exclusive_group()
     subsample_limits_group.add_argument('--sequences-per-group', type=int, help="subsample to no more than this number of sequences per category")
-    subsample_limits_group.add_argument('--subsample-max-sequences', type=int, help="subsample to no more than this number of sequences")
+    subsample_limits_group.add_argument('--subsample-max-sequences', type=int, help="subsample to no more than this number of sequences; can be used without the group_by argument")
     probabilistic_sampling_group = subsample_group.add_mutually_exclusive_group()
     probabilistic_sampling_group.add_argument('--probabilistic-sampling', action='store_true', help="Enable probabilistic sampling during subsampling. This is useful when there are more groups than requested sequences. This option only applies when `--subsample-max-sequences` is provided.")
     probabilistic_sampling_group.add_argument('--no-probabilistic-sampling', action='store_false', dest='probabilistic_sampling')
@@ -347,9 +348,9 @@ def run(args):
         dates = get_numerical_dates(meta_dict, fmt="%Y-%m-%d")
         tmp = {s for s in seq_keep if dates[s] is not None}
         if args.min_date:
-            tmp = {s for s in tmp if (np.isscalar(dates[s]) or all(dates[s])) and np.max(dates[s])>args.min_date}
+            tmp = {s for s in tmp if (np.isscalar(dates[s]) or all(dates[s])) and np.max(dates[s])>=args.min_date}
         if args.max_date:
-            tmp = {s for s in tmp if (np.isscalar(dates[s]) or all(dates[s])) and np.min(dates[s])<args.max_date}
+            tmp = {s for s in tmp if (np.isscalar(dates[s]) or all(dates[s])) and np.min(dates[s])<=args.max_date}
         num_excluded_by_date = len(seq_keep) - len(tmp)
         seq_keep = tmp
 
@@ -375,16 +376,26 @@ def run(args):
     if args.subsample_seed:
         random.seed(args.subsample_seed)
     num_excluded_subsamp = 0
-    if args.group_by and (args.sequences_per_group or args.subsample_max_sequences):
+    if args.subsample_max_sequences or (args.group_by and args.sequences_per_group):
+
+        #set groups to group_by values
+        if args.group_by:
+            groups = args.group_by
+        #if group_by not specified use dummy category
+        else:
+            groups = ["_dummy"]
+
         spg = args.sequences_per_group
         seq_names_by_group = defaultdict(list)
 
         for seq_name in seq_keep:
             group = []
             m = meta_dict[seq_name]
             # collect group specifiers
-            for c in args.group_by:
-                if c in m:
+            for c in groups:
+                if c == "_dummy":
+                    group.append(c)
+                elif c in m:
                     group.append(m[c])
                 elif c in ['month', 'year'] and 'date' in m:
                     try:
@@ -406,16 +417,16 @@ def run(args):
 
         #If didnt find any categories specified, all seqs will be in 'unknown' - but don't sample this!
         if len(seq_names_by_group)==1 and ('unknown' in seq_names_by_group or ('unknown',) in seq_names_by_group):
-            print("WARNING: The specified group-by categories (%s) were not found."%args.group_by,
+            print("WARNING: The specified group-by categories (%s) were not found."%groups,
                   "No sequences-per-group sampling will be done.")
-            if any([x in args.group_by for x in ['year','month']]):
+            if any([x in groups for x in ['year','month']]):
                 print("Note that using 'year' or 'year month' requires a column called 'date'.")
             print("\n")
         else:
             # Check to see if some categories are missing to warn the user
             group_by = set(['date' if cat in ['year','month'] else cat
-                            for cat in args.group_by])
-            missing_cats = [cat for cat in group_by if cat not in meta_columns]
+                            for cat in groups])
+            missing_cats = [cat for cat in group_by if cat not in meta_columns and cat != "_dummy"]
             if missing_cats:
                 print("WARNING:")
                 if any([cat != 'date' for cat in missing_cats]):
@@ -548,19 +559,19 @@ def run(args):
         dropped_samps = list(available_strains - seq_keep)
         write_vcf(args.sequences, args.output, dropped_samps)
     elif args.sequences and args.output:
-        sequences = SeqIO.parse(args.sequences, "fasta")
+        sequences = read_sequences(args.sequences)
 
         # Stream to disk all sequences that passed all filters to avoid reading
         # sequences into memory first. Track the observed strain names in the
         # sequence file as part of the single pass to allow comparison with the
         # provided sequence index.
         observed_sequence_strains = set()
-        with open(args.output, "w") as output_handle:
+        with open_file(args.output, "wt") as output_handle:
             for sequence in sequences:
                 observed_sequence_strains.add(sequence.id)
 
                 if sequence.id in seq_keep:
-                    SeqIO.write(sequence, output_handle, 'fasta')
+                    write_sequences(sequence, output_handle, 'fasta')
 
         if sequence_strains != observed_sequence_strains:
             # Warn the user if the expected strains from the sequence index are

diff --git a/augur/index.py b/augur/index.py
@@ -8,11 +8,15 @@
 import sys
 import csv
 
+from .io import open_file, read_sequences
+
+
 def register_arguments(parser):
     parser.add_argument('--sequences', '-s', required=True, help="sequences in fasta format")
     parser.add_argument('--output', '-o', help="tab-delimited file containing the number of bases per sequence in the given file. Output columns include strain, length, and counts for A, C, G, T, N, other valid IUPAC characters, ambiguous characters ('?' and '-'), and other invalid characters.", required=True)
     parser.add_argument('--verbose', '-v', action="store_true", help="print index statistics to stdout")
 
+
 def index_sequence(sequence, values):
     """Count the number of nucleotides for a given sequence record.
 
@@ -127,13 +131,7 @@ def index_sequences(sequences_path, sequence_index_path):
         total length of sequences indexed
 
     """
-    #read in files
-    try:
-        seqs = SeqIO.parse(sequences_path, 'fasta')
-    except ValueError as error:
-        print("ERROR: Problem reading in {}:".format(sequences_path), file=sys.stderr)
-        print(error, file=sys.stderr)
-        return 1
+    seqs = read_sequences(sequences_path)
 
     other_IUPAC = {'r', 'y', 's', 'w', 'k', 'm', 'd', 'h', 'b', 'v'}
     values = [{'a'},{'c'},{'g'},{'t'},{'n'},other_IUPAC,{'-'},{'?'}]
@@ -142,7 +140,7 @@ def index_sequences(sequences_path, sequence_index_path):
     tot_length = 0
     num_of_seqs = 0
 
-    with open(sequence_index_path, 'wt') as out_file:
+    with open_file(sequence_index_path, 'wt') as out_file:
         tsv_writer = csv.writer(out_file, delimiter = '\t')
 
         #write header i output file
@@ -166,7 +164,12 @@ def run(args):
     ("?" and "-"), and other invalid characters in a set of sequences and write
     the composition as a data frame to the given sequence index path.
     '''
-    num_of_seqs, tot_length = index_sequences(args.sequences, args.output)
+    try:
+        num_of_seqs, tot_length = index_sequences(args.sequences, args.output)
+    except ValueError as error:
+        print("ERROR: Problem reading in {}:".format(sequences_path), file=sys.stderr)
+        print(error, file=sys.stderr)
+        return 1
 
     if args.verbose:
         print("Analysed %i sequences with an average length of %i nucleotides." % (num_of_seqs, int(tot_length / num_of_seqs)))