From 9d79b1989b23fc67446144e1c49ad48764b72556 Mon Sep 17 00:00:00 2001
From: Marcus Fedarko
Date: Thu, 2 May 2019 20:40:13 -0700
Subject: [PATCH] BUG: Convert "bool" metadata cols to have strings

See comments for justification. This came up while I was testing stuff.

Related to #62. The easiest solution would probably be using
qiime2.Metadata.load(), but I don't want to introduce a heavy dependency
on QIIME 2 for people using rankratioviz in isolation.
---
 rankratioviz/scripts/_plot.py           | 19 +++++++++++-
 rankratioviz/tests/testing_utilities.py | 40 ++++++++++++++++++++++---
 2 files changed, 54 insertions(+), 5 deletions(-)

diff --git a/rankratioviz/scripts/_plot.py b/rankratioviz/scripts/_plot.py
index 30b7c008..6b6fc5b3 100644
--- a/rankratioviz/scripts/_plot.py
+++ b/rankratioviz/scripts/_plot.py
@@ -70,7 +70,24 @@ def plot(
         logging.basicConfig(level=logging.DEBUG)
 
     def read_metadata(md_file_loc):
-        return pd.read_csv(md_file_loc, index_col=0, sep="\t")
+        """Reads in the metadata file using pandas.read_csv().
+
+        One slightly strange thing is that pandas.read_csv() interprets
+        columns containing all values of True / False as booleans. This
+        causes problems down the line, since these values are converted to
+        true / false (note the lowercase) when using them in JavaScript.
+
+        To ensure consistency with QIIME 2's metadata guidelines (which only
+        consider numeric and categorical types), we convert all values in
+        columns labelled with the bool type to strings. This preserves the
+        "case" of True / False, and should result in predictable outcomes.
+        """
+        metadata_df = pd.read_csv(md_file_loc, index_col=0, sep="\t")
+        bool_cols = metadata_df.select_dtypes(include=[bool]).columns
+        if len(bool_cols) > 0:
+            type_conv_dict = {col: str for col in bool_cols}
+            metadata_df = metadata_df.astype(type_conv_dict)
+        return metadata_df
 
     logging.debug("Starting the standalone rrv script.")
     loaded_biom = load_table(table)
diff --git a/rankratioviz/tests/testing_utilities.py b/rankratioviz/tests/testing_utilities.py
index 0d0ee5a4..add4e608 100644
--- a/rankratioviz/tests/testing_utilities.py
+++ b/rankratioviz/tests/testing_utilities.py
@@ -1,6 +1,7 @@
 import os
 import json
 from pytest import approx
+import pandas as pd
 from click.testing import CliRunner
 from qiime2 import Artifact, Metadata
 from qiime2.plugins import rankratioviz as q2rankratioviz
@@ -255,6 +256,8 @@ def basic_vegalite_json_validation(json_obj):
 
 def validate_rank_plot_json(input_ranks_loc, rank_json):
     """Ensure that the rank plot JSON makes sense."""
+    # TODO check that feature metadata annotations were properly applied to the
+    # features. Will need the feature metadata file location to be passed here
     reference_features = rank_file_to_df(input_ranks_loc)
     # Validate some basic properties of the plot
     # (This is all handled by Altair, so these property tests aren't
@@ -315,8 +318,37 @@ def validate_sample_plot_json(biom_table_loc, metadata_loc, sample_json):
     assert sample_json["mark"] == "circle"
     assert sample_json["title"] == "Log Ratio of Abundances in Samples"
     basic_vegalite_json_validation(sample_json)
-    # dn = sample_json["data"]["name"]
-    # TODO check that all metadata samples are accounted for in BIOM table
-    # TODO check that every log ratio is correct? I guess that'll make us
-    # load the rank plots file, but it's worth it (tm)
+    dn = sample_json["data"]["name"]
+
+    # Check that each sample's metadata in the sample plot JSON matches with
+    # its actual metadata.
+    sample_metadata = pd.read_csv(metadata_loc, index_col=0, sep="\t")
+    for sample in sample_json["datasets"][dn]:
+        sample_id = sample["Sample ID"]
+        for metadata_col in sample_metadata.columns:
+            expected_md = sample_metadata.at[sample_id, metadata_col]
+            actual_md = sample[metadata_col]
+            # There are some weird things in how boolean values are
+            # loaded/generated between the qiime2.Metadata interface,
+            # pandas.read_csv, json.loads(), and Altair, so to make our lives
+            # easier we just care about the string representation of booleans.
+            # (TLDR: pandas.read_csv() results in numpy.bool_ types being
+            # present, which makes things super fun.)
+            if str(expected_md) in ["True", "False"]:
+                assert str(expected_md) == str(actual_md)
+            else:
+                assert expected_md == actual_md
+        # Not really "metadata", but just as a sanity check verify that the
+        # initial rankratioviz_balance of each sample is null (aka None in
+        # python) -- this ensures that no samples will show up when the
+        # visualization is initially displayed, which is the intended behavior.
+        assert sample["rankratioviz_balance"] is None
+
+    # TODO check that every entry (sample x feature) matches with the BIOM
+    # table. (If the BIOM table has, say, > 1 million entries, this might be
+    # dumb, but the test data right now is fine.)
+    # This can be done by using the rankratioviz_feature_col_ids dataset to
+    # understand what column mappings were set, and then using those column
+    # mappings to look at the rankratioviz_feature_counts.
+    # TODO check anything else in the sample plot JSON I'm forgetting
 