From 9d79b1989b23fc67446144e1c49ad48764b72556 Mon Sep 17 00:00:00 2001
From: Marcus Fedarko
Date: Thu, 2 May 2019 20:40:13 -0700
Subject: [PATCH] BUG: Convert "bool" metadata cols to have strings

See comments for justification. This came up while I was testing stuff.

Related to #62. The easiest solution would probably be using
qiime2.Metadata.load(), but I don't want to introduce a heavy dependency
on QIIME 2 for people using rankratioviz in isolation.
---
 rankratioviz/scripts/_plot.py           | 19 +++++++++++-
 rankratioviz/tests/testing_utilities.py | 40 ++++++++++++++++++++++---
 2 files changed, 54 insertions(+), 5 deletions(-)

diff --git a/rankratioviz/scripts/_plot.py b/rankratioviz/scripts/_plot.py
index 30b7c008..6b6fc5b3 100644
--- a/rankratioviz/scripts/_plot.py
+++ b/rankratioviz/scripts/_plot.py
@@ -70,7 +70,24 @@ def plot(
         logging.basicConfig(level=logging.DEBUG)
 
     def read_metadata(md_file_loc):
-        return pd.read_csv(md_file_loc, index_col=0, sep="\t")
+        """Reads in the metadata file using pandas.read_csv().
+
+        One slightly strange thing is that pandas.read_csv() interprets
+        columns containing all values of True / False as booleans. This
+        causes problems down the line, since these values are converted to
+        true / false (note the lowercase) when using them in JavaScript.
+
+        To ensure consistency with QIIME 2's metadata guidelines (which only
+        consider numeric and categorical types), we convert all values in
+        columns labelled with the bool type to strings. This preserves the
+        "case" of True / False, and should result in predictable outcomes.
+        """
+        metadata_df = pd.read_csv(md_file_loc, index_col=0, sep="\t")
+        bool_cols = metadata_df.select_dtypes(include=[bool]).columns
+        if len(bool_cols) > 0:
+            type_conv_dict = {col: str for col in bool_cols}
+            metadata_df = metadata_df.astype(type_conv_dict)
+        return metadata_df
 
     logging.debug("Starting the standalone rrv script.")
     loaded_biom = load_table(table)
diff --git a/rankratioviz/tests/testing_utilities.py b/rankratioviz/tests/testing_utilities.py
index 0d0ee5a4..add4e608 100644
--- a/rankratioviz/tests/testing_utilities.py
+++ b/rankratioviz/tests/testing_utilities.py
@@ -1,6 +1,7 @@
 import os
 import json
 from pytest import approx
+import pandas as pd
 from click.testing import CliRunner
 from qiime2 import Artifact, Metadata
 from qiime2.plugins import rankratioviz as q2rankratioviz
@@ -255,6 +256,8 @@ def basic_vegalite_json_validation(json_obj):
 
 def validate_rank_plot_json(input_ranks_loc, rank_json):
     """Ensure that the rank plot JSON makes sense."""
+    # TODO check that feature metadata annotations were properly applied to the
+    # features. Will need the feature metadata file location to be passed here
     reference_features = rank_file_to_df(input_ranks_loc)
     # Validate some basic properties of the plot
     # (This is all handled by Altair, so these property tests aren't
@@ -315,8 +318,37 @@ def validate_sample_plot_json(biom_table_loc, metadata_loc, sample_json):
     assert sample_json["mark"] == "circle"
     assert sample_json["title"] == "Log Ratio of Abundances in Samples"
     basic_vegalite_json_validation(sample_json)
-    # dn = sample_json["data"]["name"]
-    # TODO check that all metadata samples are accounted for in BIOM table
-    # TODO check that every log ratio is correct? I guess that'll make us
-    # load the rank plots file, but it's worth it (tm)
+    dn = sample_json["data"]["name"]
+
+    # Check that each sample's metadata in the sample plot JSON matches with
+    # its actual metadata.
+    sample_metadata = pd.read_csv(metadata_loc, index_col=0, sep="\t")
+    for sample in sample_json["datasets"][dn]:
+        sample_id = sample["Sample ID"]
+        for metadata_col in sample_metadata.columns:
+            expected_md = sample_metadata.at[sample_id, metadata_col]
+            actual_md = sample[metadata_col]
+            # There are some weird things in how boolean values are
+            # loaded/generated between the qiime2.Metadata interface,
+            # pandas.read_csv, json.loads(), and Altair, so to make our lives
+            # easier we just care about the string representation of booleans.
+            # (TLDR: pandas.read_csv() results in numpy.bool_ types being
+            # present, which makes things super fun.)
+            if str(expected_md) in ["True", "False"]:
+                assert str(expected_md) == str(actual_md)
+            else:
+                assert expected_md == actual_md
+        # Not really "metadata", but just as a sanity check verify that the
+        # initial rankratioviz_balance of each sample is null (aka None in
+        # python) -- this ensures that no samples will show up when the
+        # visualization is initially displayed, which is the intended behavior.
+        assert sample["rankratioviz_balance"] is None
+
+    # TODO check that every entry (sample x feature) matches with the BIOM
+    # table. (If the BIOM table has, say, > 1 million entries, this might be
+    # dumb, but the test data right now is fine.)
+    # This can be done by using the rankratioviz_feature_col_ids dataset to
+    # understand what column mappings were set, and then using those column
+    # mappings to look at the rankratioviz_feature_counts.
+    # TODO check anything else in the sample plot JSON I'm forgetting
 