Allow repeating an option that takes multiple values

Many command line arguments take multiple values. Previously, to utilize this feature, all values had to be specified after a single option flag (e.g. --exclude-where 'region=A' 'region=B'). If options were set using separate option flags (e.g. --exclude-where 'region=A' --exclude-where 'region=B'), only the last flag would be used, which is unintuitive. This commit enables all option flags to be used. This is done across the codebase by adding action='extend' to all options that use nargs='+' and do not already specify an action. A side effect of action='extend' is that it extends, instead of replacing, a default value defined in add_argument. For arguments that use a custom default value, use a custom argparse action, ExtendOverwriteDefault, based on the similarly named AppendOverwriteDefault from Nextstrain CLI¹. ¹ <https://github.com/nextstrain/cli/blob/9bf646b0c795d04658a6f6807d74428b7c173995/nextstrain/cli/argparse.py#L183-L197>
nextstrain · Apr 22, 2024 · 03b49da · 03b49da
1 parent bac41fa
commit 03b49da
Show file tree

Hide file tree

Showing 27 changed files with 103 additions and 70 deletions.
diff --git a/augur/align.py b/augur/align.py
@@ -24,7 +24,7 @@ def register_arguments(parser):
     Kept as a separate function than `register_parser` to continue to support
     unit tests that use this function to create argparser.
     """
-    parser.add_argument('--sequences', '-s', required=True, nargs="+", metavar="FASTA", help="sequences to align")
+    parser.add_argument('--sequences', '-s', required=True, nargs="+", action="extend", metavar="FASTA", help="sequences to align")
     parser.add_argument('--output', '-o', default="alignment.fasta", help="output file (default: %(default)s)")
     parser.add_argument('--nthreads', type=nthreads_value, default=1,
                                 help="number of threads to use; specifying the value 'auto' will cause the number of available CPU cores on your system, if determinable, to be used")

diff --git a/augur/ancestral.py b/augur/ancestral.py
@@ -315,7 +315,7 @@ def register_parser(parent_subparsers):
     )
     amino_acid_options_group.add_argument('--annotation',
                         help='GenBank or GFF file containing the annotation')
-    amino_acid_options_group.add_argument('--genes', nargs='+', help="genes to translate (list or file containing list)")
+    amino_acid_options_group.add_argument('--genes', nargs='+', action='extend', help="genes to translate (list or file containing list)")
     amino_acid_options_group.add_argument('--translations', type=str, help="translated alignments for each CDS/Gene. "
                            "Currently only supported for FASTA-input. Specify the file name via a "
                            "template like 'aa_sequences_%%GENE.fasta' where %%GENE will be replaced "

diff --git a/augur/argparse_.py b/augur/argparse_.py
@@ -76,3 +76,20 @@ class HideAsFalseAction(Action):
     """
     def __call__(self, parser, namespace, values, option_string=None):
         setattr(namespace, self.dest, option_string[2:6] != 'hide')
+
+
+class ExtendOverwriteDefault(Action):
+    """
+    Like argparse's built-in ``extend`` action, except that the first use of
+    the option replaces the argument's ``default`` instead of adding to it.
+    Repeated uses of the option accumulate into a single list, so the
+    ``default`` is dropped once the option is given and need not be a list.
+    The default is detected by identity against ``parser.get_default(dest)``.
+    """
+    def __call__(self, parser, namespace, value, option_string = None):
+        current = getattr(namespace, self.dest, None)
+        # Still holding the untouched default object (or nothing at all): start fresh.
+        if current is parser.get_default(self.dest) or current is None:
+            current = []
+        # Build a new list instead of mutating in place, so the default is never modified.
+        setattr(namespace, self.dest, [*current, *value])
diff --git a/augur/clades.py b/augur/clades.py
@@ -341,8 +341,8 @@ def parse_nodes(tree_file, node_data_files):
 def register_parser(parent_subparsers):
     parser = parent_subparsers.add_parser("clades", help=__doc__)
     parser.add_argument('--tree', required=True, help="prebuilt Newick -- no tree will be built if provided")
-    parser.add_argument('--mutations', required=True, metavar="NODE_DATA_JSON", nargs='+', help='JSON(s) containing ancestral and tip nucleotide and/or amino-acid mutations ')
-    parser.add_argument('--reference', nargs='+', help=SUPPRESS)
+    parser.add_argument('--mutations', required=True, metavar="NODE_DATA_JSON", nargs='+', action='extend', help='JSON(s) containing ancestral and tip nucleotide and/or amino-acid mutations ')
+    parser.add_argument('--reference', nargs='+', action='extend', help=SUPPRESS)
     parser.add_argument('--clades', required=True, metavar="TSV", type=str, help='TSV file containing clade definitions by amino-acid')
     parser.add_argument('--output-node-data', type=str,  metavar="NODE_DATA_JSON", help='name of JSON file to save clade assignments to')
     parser.add_argument('--membership-name', type=str, default="clade_membership", help='Key to store clade membership under; use "None" to not export this')

diff --git a/augur/curate/__init__.py b/augur/curate/__init__.py
@@ -6,7 +6,7 @@
 from collections import deque
 from textwrap import dedent
 
-from augur.argparse_ import add_command_subparsers
+from augur.argparse_ import ExtendOverwriteDefault, add_command_subparsers
 from augur.errors import AugurError
 from augur.io.json import dump_ndjson, load_ndjson
 from augur.io.metadata import DEFAULT_DELIMITERS, InvalidDelimiter, read_table_to_dict, read_metadata_with_sequences, write_records_to_tsv
@@ -53,7 +53,7 @@ def create_shared_parser():
         help="Name of the metadata column that contains the record identifier for reporting duplicate records. "
              "Uses the first column of the metadata file if not provided. "
              "Ignored if also providing a FASTA file input.")
-    shared_inputs.add_argument("--metadata-delimiters", default=DEFAULT_DELIMITERS, nargs="+",
+    shared_inputs.add_argument("--metadata-delimiters", default=DEFAULT_DELIMITERS, nargs="+", action=ExtendOverwriteDefault,
         help="Delimiters to accept when reading a metadata file. Only one delimiter will be inferred.")
 
     shared_inputs.add_argument("--fasta",

diff --git a/augur/curate/format_dates.py b/augur/curate/format_dates.py
@@ -18,9 +18,9 @@ def register_parser(parent_subparsers):
         help=__doc__)
 
     required = parser.add_argument_group(title="REQUIRED")
-    required.add_argument("--date-fields", nargs="+",
+    required.add_argument("--date-fields", nargs="+", action="extend",
         help="List of date field names in the record that need to be standardized.")
-    required.add_argument("--expected-date-formats", nargs="+",
+    required.add_argument("--expected-date-formats", nargs="+", action="extend",
         help="Expected date formats that are currently in the provided date fields, " +
              "defined by standard format codes as listed at " +
              "https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes. " +

diff --git a/augur/distance.py b/augur/distance.py
@@ -660,11 +660,11 @@ def get_distances_to_all_pairs(tree, sequences_by_node_and_gene, distance_map, e
 def register_parser(parent_subparsers):
     parser = parent_subparsers.add_parser("distance", help=first_line(__doc__))
     parser.add_argument("--tree", help="Newick tree", required=True)
-    parser.add_argument("--alignment", nargs="+", help="sequence(s) to be used, supplied as FASTA files", required=True)
-    parser.add_argument('--gene-names', nargs="+", type=str, help="names of the sequences in the alignment, same order assumed", required=True)
-    parser.add_argument("--attribute-name", nargs="+", help="name to store distances associated with the given distance map; multiple attribute names are linked to corresponding positional comparison method and distance map arguments", required=True)
-    parser.add_argument("--compare-to", nargs="+", choices=["root", "ancestor", "pairwise"], help="type of comparison between samples in the given tree including comparison of all nodes to the root (root), all tips to their last ancestor from a previous season (ancestor), or all tips from the current season to all tips in previous seasons (pairwise)", required=True)
-    parser.add_argument("--map", nargs="+", help="JSON providing the distance map between sites and, optionally, sequences present at those sites; the distance map JSON minimally requires a 'default' field defining a default numeric distance and a 'map' field defining a dictionary of genes and one-based coordinates", required=True)
+    parser.add_argument("--alignment", nargs="+", action="extend", help="sequence(s) to be used, supplied as FASTA files", required=True)
+    parser.add_argument('--gene-names', nargs="+", action="extend", type=str, help="names of the sequences in the alignment, same order assumed", required=True)
+    parser.add_argument("--attribute-name", nargs="+", action="extend", help="name to store distances associated with the given distance map; multiple attribute names are linked to corresponding positional comparison method and distance map arguments", required=True)
+    parser.add_argument("--compare-to", nargs="+", action="extend", choices=["root", "ancestor", "pairwise"], help="type of comparison between samples in the given tree including comparison of all nodes to the root (root), all tips to their last ancestor from a previous season (ancestor), or all tips from the current season to all tips in previous seasons (pairwise)", required=True)
+    parser.add_argument("--map", nargs="+", action="extend", help="JSON providing the distance map between sites and, optionally, sequences present at those sites; the distance map JSON minimally requires a 'default' field defining a default numeric distance and a 'map' field defining a dictionary of genes and one-based coordinates", required=True)
     parser.add_argument("--date-annotations", help="JSON of branch lengths and date annotations from augur refine for samples in the given tree; required for comparisons to earliest or latest date")
     parser.add_argument("--earliest-date", help="earliest date at which samples are considered to be from previous seasons (e.g., 2019-01-01). This date is only used in pairwise comparisons. If omitted, all samples prior to the latest date will be considered.")
     parser.add_argument("--latest-date", help="latest date at which samples are considered to be from previous seasons (e.g., 2019-01-01); samples from any date after this are considered part of the current season")

diff --git a/augur/export_v1.py b/augur/export_v1.py
@@ -9,6 +9,7 @@
 from Bio import Phylo
 from argparse import SUPPRESS
 from collections import defaultdict
+from .argparse_ import ExtendOverwriteDefault
 from .errors import AugurError
 from .io.metadata import DEFAULT_DELIMITERS, InvalidDelimiter, read_metadata
 from .utils import read_node_data, write_json, read_config, read_lat_longs, read_colors
@@ -312,7 +313,7 @@ def add_core_args(parser):
     core = parser.add_argument_group("REQUIRED")
     core.add_argument('--tree','-t', required=True, help="tree to perform trait reconstruction on")
     core.add_argument('--metadata', required=True, metavar="FILE", help="sequence metadata")
-    core.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+",
+    core.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+", action=ExtendOverwriteDefault,
                       help="delimiters to accept when reading a metadata file. Only one delimiter will be inferred.")
     core.add_argument('--node-data', required=True, nargs='+', action="extend", help="JSON files with meta data for each node")
     core.add_argument('--output-tree', help="JSON file name that is passed on to auspice (e.g., zika_tree.json).")

diff --git a/augur/export_v2.py b/augur/export_v2.py
@@ -11,6 +11,7 @@
 import re
 from Bio import Phylo
 
+from .argparse_ import ExtendOverwriteDefault
 from .errors import AugurError
 from .io.file import open_file
 from .io.metadata import DEFAULT_DELIMITERS, DEFAULT_ID_COLUMNS, InvalidDelimiter, read_metadata
@@ -889,21 +890,21 @@ def register_parser(parent_subparsers):
     config.add_argument('--maintainers', metavar="name", action="append", nargs='+', help="Analysis maintained by, in format 'Name <URL>' 'Name2 <URL>', ...")
     config.add_argument('--build-url', type=str, metavar="url", help="Build URL/repository to be displayed by Auspice")
     config.add_argument('--description', metavar="description.md", help="Markdown file with description of build and/or acknowledgements to be displayed by Auspice")
-    config.add_argument('--geo-resolutions', metavar="trait", nargs='+', help="Geographic traits to be displayed on map")
-    config.add_argument('--color-by-metadata', metavar="trait", nargs='+', help="Metadata columns to include as coloring options")
-    config.add_argument('--metadata-columns', nargs="+",
+    config.add_argument('--geo-resolutions', metavar="trait", nargs='+', action='extend', help="Geographic traits to be displayed on map")
+    config.add_argument('--color-by-metadata', metavar="trait", nargs='+', action='extend', help="Metadata columns to include as coloring options")
+    config.add_argument('--metadata-columns', nargs="+", action="extend",
                                  help="Metadata columns to export in addition to columns provided by --color-by-metadata or colorings in the Auspice configuration file. " +
                                       "These columns will not be used as coloring options in Auspice but will be visible in the tree.")
-    config.add_argument('--panels', metavar="panels", nargs='+', choices=['tree', 'map', 'entropy', 'frequencies', 'measurements'], help="Restrict panel display in auspice. Options are %(choices)s. Ignore this option to display all available panels.")
+    config.add_argument('--panels', metavar="panels", nargs='+', action='extend', choices=['tree', 'map', 'entropy', 'frequencies', 'measurements'], help="Restrict panel display in auspice. Options are %(choices)s. Ignore this option to display all available panels.")
 
     optional_inputs = parser.add_argument_group(
         title="OPTIONAL INPUT FILES"
     )
     optional_inputs.add_argument('--node-data', metavar="JSON", nargs='+', action="extend", help="JSON files containing metadata for nodes in the tree")
     optional_inputs.add_argument('--metadata', metavar="FILE", help="Additional metadata for strains in the tree")
-    optional_inputs.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+",
+    optional_inputs.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+", action=ExtendOverwriteDefault,
                                  help="delimiters to accept when reading a metadata file. Only one delimiter will be inferred.")
-    optional_inputs.add_argument('--metadata-id-columns', default=DEFAULT_ID_COLUMNS, nargs="+",
+    optional_inputs.add_argument('--metadata-id-columns', default=DEFAULT_ID_COLUMNS, nargs="+", action=ExtendOverwriteDefault,
                                  help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.")
     optional_inputs.add_argument('--colors', metavar="FILE", help="Custom color definitions, one per line in the format `TRAIT_TYPE\\tTRAIT_VALUE\\tHEX_CODE`")
     optional_inputs.add_argument('--lat-longs', metavar="TSV", help="Latitudes and longitudes for geography traits (overrides built in mappings)")

diff --git a/augur/filter/__init__.py b/augur/filter/__init__.py
@@ -1,6 +1,7 @@
 """
 Filter and subsample a sequence set.
 """
+from augur.argparse_ import ExtendOverwriteDefault
 from augur.dates import numeric_date_type, SUPPORTED_DATE_HELP_TEXT
 from augur.filter.io import ACCEPTED_TYPES, column_type_pair
 from augur.io.metadata import DEFAULT_DELIMITERS, DEFAULT_ID_COLUMNS, METADATA_DATE_COLUMN
@@ -19,8 +20,8 @@ def register_arguments(parser):
     input_group.add_argument('--sequences', '-s', help="sequences in FASTA or VCF format")
     input_group.add_argument('--sequence-index', help="sequence composition report generated by augur index. If not provided, an index will be created on the fly.")
     input_group.add_argument('--metadata-chunk-size', type=int, default=100000, help="maximum number of metadata records to read into memory at a time. Increasing this number can speed up filtering at the cost of more memory used.")
-    input_group.add_argument('--metadata-id-columns', default=DEFAULT_ID_COLUMNS, nargs="+", help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.")
-    input_group.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+", help="delimiters to accept when reading a metadata file. Only one delimiter will be inferred.")
+    input_group.add_argument('--metadata-id-columns', default=DEFAULT_ID_COLUMNS, nargs="+", action=ExtendOverwriteDefault, help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.")
+    input_group.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+", action=ExtendOverwriteDefault, help="delimiters to accept when reading a metadata file. Only one delimiter will be inferred.")
 
     metadata_filter_group = parser.add_argument_group("metadata filters", "filters to apply to metadata")
     metadata_filter_group.add_argument(
@@ -29,7 +30,7 @@ def register_arguments(parser):
         Uses Pandas Dataframe querying, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query for syntax.
         (e.g., --query "country == 'Colombia'" or --query "(country == 'USA' & (division == 'Washington'))")"""
     )
-    metadata_filter_group.add_argument('--query-columns', type=column_type_pair, nargs="+", help=f"""
+    metadata_filter_group.add_argument('--query-columns', type=column_type_pair, nargs="+", action="extend", help=f"""
         Use alongside --query to specify columns and data types in the format 'column:type', where type is one of ({','.join(ACCEPTED_TYPES)}).
         Automatic type inference will be attempted on all unspecified columns used in the query.
         Example: region:str coverage:float.
@@ -38,12 +39,12 @@ def register_arguments(parser):
     metadata_filter_group.add_argument('--max-date', type=numeric_date_type, help=f"maximal cutoff for date, the cutoff date is inclusive; may be specified as: {SUPPORTED_DATE_HELP_TEXT}")
     metadata_filter_group.add_argument('--exclude-ambiguous-dates-by', choices=['any', 'day', 'month', 'year'],
                                 help='Exclude ambiguous dates by day (e.g., 2020-09-XX), month (e.g., 2020-XX-XX), year (e.g., 200X-10-01), or any date fields. An ambiguous year makes the corresponding month and day ambiguous, too, even if those fields have unambiguous values (e.g., "201X-10-01"). Similarly, an ambiguous month makes the corresponding day ambiguous (e.g., "2010-XX-01").')
-    metadata_filter_group.add_argument('--exclude', type=str, nargs="+", help="file(s) with list of strains to exclude")
-    metadata_filter_group.add_argument('--exclude-where', nargs='+',
+    metadata_filter_group.add_argument('--exclude', type=str, nargs="+", action="extend", help="file(s) with list of strains to exclude")
+    metadata_filter_group.add_argument('--exclude-where', nargs='+', action='extend',
                                 help="Exclude samples matching these conditions. Ex: \"host=rat\" or \"host!=rat\". Multiple values are processed as OR (matching any of those specified will be excluded), not AND")
     metadata_filter_group.add_argument('--exclude-all', action="store_true", help="exclude all strains by default. Use this with the include arguments to select a specific subset of strains.")
-    metadata_filter_group.add_argument('--include', type=str, nargs="+", help="file(s) with list of strains to include regardless of priorities, subsampling, or absence of an entry in --sequences.")
-    metadata_filter_group.add_argument('--include-where', nargs='+', help="""
+    metadata_filter_group.add_argument('--include', type=str, nargs="+", action="extend", help="file(s) with list of strains to include regardless of priorities, subsampling, or absence of an entry in --sequences.")
+    metadata_filter_group.add_argument('--include-where', nargs='+', action='extend', help="""
         Include samples with these values. ex: host=rat. Multiple values are
         processed as OR (having any of those specified will be included), not
         AND. This rule is applied last and ensures any strains matching these
@@ -56,7 +57,7 @@ def register_arguments(parser):
     sequence_filter_group.add_argument('--non-nucleotide', action='store_true', help="exclude sequences that contain illegal characters")
 
     subsample_group = parser.add_argument_group("subsampling", "options to subsample filtered data")
-    subsample_group.add_argument('--group-by', nargs='+', help=f"""
+    subsample_group.add_argument('--group-by', nargs='+', action='extend', help=f"""
         categories with respect to subsample.
         Notes:
         (1) Grouping by {sorted(constants.GROUP_BY_GENERATED_COLUMNS)} is only supported when there is a {METADATA_DATE_COLUMN!r} column in the metadata.