Explicitly drop date/year/month columns from metadata during grouping

- The `date` column should not be used for grouping; use the generated columns instead.
- Any `year`/`month` columns in the original metadata should be overridden by the generated columns.

Without dropping these columns explicitly, a cryptic pandas ValueError occurs.
nextstrain · Jun 28, 2022 · f625fbe · f625fbe
1 parent f07cc5f
commit f625fbe
Show file tree

Hide file tree

Showing 2 changed files with 38 additions and 0 deletions.
diff --git a/augur/filter.py b/augur/filter.py
@@ -931,6 +931,14 @@ def get_groups_for_subsampling(strains, metadata, group_by=None):
 
     # date requested
     if 'year' in group_by_set or 'month' in group_by_set:
+
+        if 'year' in metadata.columns and 'year' in group_by_set:
+            print(f"WARNING: For --group-by purposes, the 'year' column in metadata will be overridden by the generated value from 'date' column.", file=sys.stderr)
+            metadata.drop('year', axis=1, inplace=True)
+        if 'month' in metadata.columns and 'month' in group_by_set:
+            print(f"WARNING: For --group-by purposes, the 'month' column in metadata will be overridden by the generated value from 'date' column.", file=sys.stderr)
+            metadata.drop('month', axis=1, inplace=True)
+
         if 'date' not in metadata:
             # set year/month/day = unknown
             print(f"WARNING: A 'date' column could not be found to group-by year or month.", file=sys.stderr)

diff --git a/tests/functional/filter/cram/subsample-group-by-with-custom-year-column.t b/tests/functional/filter/cram/subsample-group-by-with-custom-year-column.t
@@ -0,0 +1,30 @@
+Setup
+
+  $ pushd "$TESTDIR" > /dev/null
+  $ source _setup.sh
+
+Create a metadata file with a custom year column
+
+  $ cat >$TMP/metadata-year-column.tsv <<~~
+  > strain	date	year
+  > SEQ1	2021-01-01	odd
+  > SEQ2	2021-01-02	odd
+  > SEQ3	2022-01-01	even
+  > SEQ4	2022-01-02	even
+  > SEQ5	2022-02-02	even
+  > ~~
+
+Group by the generated year and month columns, and ensure the custom year column is still in the final output.
+
+  $ ${AUGUR} filter \
+  >  --metadata $TMP/metadata-year-column.tsv \
+  >  --group-by year month \
+  >  --sequences-per-group 1 \
+  >  --subsample-seed 0 \
+  >  --output-metadata "$TMP/filtered_metadata.tsv" > /dev/null
+  WARNING: For --group-by purposes, the 'year' column in metadata will be overridden by the generated value from 'date' column.
+  $ cat "$TMP/filtered_metadata.tsv"
+  strain\tdate\tyear (esc)
+  SEQ1\t2021-01-01\todd (esc)
+  SEQ3\t2022-01-01\teven (esc)
+  SEQ5\t2022-02-02\teven (esc)