Merge pull request #967: filter: Explicitly drop date/year/month columns from metadata during grouping
nextstrain · Jun 30, 2022 · d431b72 · d431b72
2 parents f07cc5f + 4fcdc86
commit d431b72
Show file tree

Hide file tree

Showing 3 changed files with 61 additions and 1 deletion.
diff --git a/CHANGES.md b/CHANGES.md
@@ -8,6 +8,8 @@
 ### Bug Fixes
 
 * filter: Handle errors from `filter_by_query` [#942][] (@victorlin)
+* filter: Explicitly drop date/year/month columns from metadata during grouping. [#967][] (@victorlin)
+    * This fixes a bug [#871][] where `augur filter` would crash with a cryptic `ValueError` when `year` and/or `month` exists as a custom column in the input metadata and is also included in `--group-by`.
 * translate: output nuc annotation when reading from gff3 gene map [#976][] (@corneliusroemer)
 * CI: Remove step for selecting PyPI instance [#974][] (@victorlin)
 * CI: Add token to use GitHub CLI [#958][] (@victorlin)
@@ -16,6 +18,8 @@
 [#976]: https://github.com/nextstrain/augur/pull/976
 [#974]: https://github.com/nextstrain/augur/pull/974
 [#958]: https://github.com/nextstrain/augur/pull/958
+[#967]: https://github.com/nextstrain/augur/pull/967
+[#871]: https://github.com/nextstrain/augur/issues/871
 
 ## 16.0.0 (16 June 2022)
 

diff --git a/augur/filter.py b/augur/filter.py
@@ -931,6 +931,14 @@ def get_groups_for_subsampling(strains, metadata, group_by=None):
 
     # date requested
     if 'year' in group_by_set or 'month' in group_by_set:
+
+        if 'year' in metadata.columns and 'year' in group_by_set:
+            print(f"WARNING: `--group-by year` uses the generated year value from the 'date' column. The custom 'year' column in the metadata is ignored for grouping purposes.", file=sys.stderr)
+            metadata.drop('year', axis=1, inplace=True)
+        if 'month' in metadata.columns and 'month' in group_by_set:
+            print(f"WARNING: `--group-by month` uses the generated month value from the 'date' column. The custom 'month' column in the metadata is ignored for grouping purposes.", file=sys.stderr)
+            metadata.drop('month', axis=1, inplace=True)
+
         if 'date' not in metadata:
             # set year/month/day = unknown
             print(f"WARNING: A 'date' column could not be found to group-by year or month.", file=sys.stderr)
@@ -1149,7 +1157,10 @@ def register_arguments(parser):
     sequence_filter_group.add_argument('--non-nucleotide', action='store_true', help="exclude sequences that contain illegal characters")
 
     subsample_group = parser.add_argument_group("subsampling", "options to subsample filtered data")
-    subsample_group.add_argument('--group-by', nargs='+', help="categories with respect to subsample; two virtual fields, \"month\" and \"year\", are supported if they don't already exist as real fields but a \"date\" field does exist")
+    subsample_group.add_argument('--group-by', nargs='+', help="""
+        categories with respect to subsample.
+        Grouping by 'year' and/or 'month' is only supported when there is a 'date' column in the metadata.
+        Custom 'year' and 'month' columns in the metadata are ignored for grouping. Please rename them if you want to use their values for grouping.""")
     subsample_limits_group = subsample_group.add_mutually_exclusive_group()
     subsample_limits_group.add_argument('--sequences-per-group', type=int, help="subsample to no more than this number of sequences per category")
     subsample_limits_group.add_argument('--subsample-max-sequences', type=int, help="subsample to no more than this number of sequences; can be used without the group_by argument")

diff --git a/tests/functional/filter/cram/subsample-group-by-with-custom-year-column.t b/tests/functional/filter/cram/subsample-group-by-with-custom-year-column.t
@@ -0,0 +1,45 @@
+Setup
+
+  $ pushd "$TESTDIR" > /dev/null
+  $ source _setup.sh
+
+Create a metadata file with custom year and month columns
+
+  $ cat >$TMP/metadata-year-column.tsv <<~~
+  > strain	date	year	month
+  > SEQ1	2021-01-01	odd	January
+  > SEQ2	2021-01-02	odd	January
+  > SEQ3	2022-01-01	even	January
+  > SEQ4	2022-01-02	even	January
+  > SEQ5	2022-02-02	even	February
+  > ~~
+
+Group by generated year column, and ensure all original columns are still in the final output.
+
+  $ ${AUGUR} filter \
+  >  --metadata $TMP/metadata-year-column.tsv \
+  >  --group-by year \
+  >  --sequences-per-group 1 \
+  >  --subsample-seed 0 \
+  >  --output-metadata "$TMP/filtered_metadata.tsv" > /dev/null
+  WARNING: `--group-by year` uses the generated year value from the 'date' column. The custom 'year' column in the metadata is ignored for grouping purposes.
+  $ cat "$TMP/filtered_metadata.tsv"
+  strain\tdate\tyear\tmonth (esc)
+  SEQ1\t2021-01-01\todd\tJanuary (esc)
+  SEQ5\t2022-02-02\teven\tFebruary (esc)
+
+Group by generated year and month columns, and ensure all original columns are still in the final output.
+
+  $ ${AUGUR} filter \
+  >  --metadata $TMP/metadata-year-column.tsv \
+  >  --group-by year month \
+  >  --sequences-per-group 1 \
+  >  --subsample-seed 0 \
+  >  --output-metadata "$TMP/filtered_metadata.tsv" > /dev/null
+  WARNING: `--group-by year` uses the generated year value from the 'date' column. The custom 'year' column in the metadata is ignored for grouping purposes.
+  WARNING: `--group-by month` uses the generated month value from the 'date' column. The custom 'month' column in the metadata is ignored for grouping purposes.
+  $ cat "$TMP/filtered_metadata.tsv"
+  strain\tdate\tyear\tmonth (esc)
+  SEQ1\t2021-01-01\todd\tJanuary (esc)
+  SEQ3\t2022-01-01\teven\tJanuary (esc)
+  SEQ5\t2022-02-02\teven\tFebruary (esc)