Commit
BUG: Convert "bool" metadata cols to have strings
See comments for justification. This came up while I was testing
stuff.

Related to #62. The easiest solution would probably be using
qiime2.Metadata.load(), but I don't want to introduce a heavy
dependency on QIIME 2 for people using rankratioviz in isolation.
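For context, here is a minimal standalone sketch (not part of the commit; the column name and file contents are invented) of the behavior the commit message describes: pandas infers a bool dtype for an all-True/False metadata column, that column serializes as lowercase true/false in JSON, and converting it to strings preserves the original capitalization.

import io

import pandas as pd

tsv = "SampleID\tIsControl\nS1\tTrue\nS2\tFalse\n"
df = pd.read_csv(io.StringIO(tsv), index_col=0, sep="\t")

print(df["IsControl"].dtype)          # bool -- pandas inferred a boolean column
print(df.to_json(orient="records"))   # [{"IsControl":true},{"IsControl":false}]

# Converting the column to str keeps the "True" / "False" text intact:
print(df.astype({"IsControl": str}).to_json(orient="records"))
# [{"IsControl":"True"},{"IsControl":"False"}]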
fedarko committed May 3, 2019
1 parent a7e7a5a commit 9d79b19
Showing 2 changed files with 54 additions and 5 deletions.
19 changes: 18 additions & 1 deletion rankratioviz/scripts/_plot.py
@@ -70,7 +70,24 @@ def plot(
    logging.basicConfig(level=logging.DEBUG)

    def read_metadata(md_file_loc):
-        return pd.read_csv(md_file_loc, index_col=0, sep="\t")
+        """Reads in the metadata file using pandas.read_csv().
+
+        One slightly strange thing is that pandas.read_csv() interprets
+        columns containing all values of True / False as booleans. This
+        causes problems down the line, since these values are converted to
+        true / false (note the lowercase) when using them in JavaScript.
+
+        To ensure consistency with QIIME 2's metadata guidelines (which only
+        consider numeric and categorical types), we convert all values in
+        columns labelled with the bool type to strings. This preserves the
+        "case" of True / False, and should result in predictable outcomes.
+        """
+        metadata_df = pd.read_csv(md_file_loc, index_col=0, sep="\t")
+        bool_cols = metadata_df.select_dtypes(include=[bool]).columns
+        if len(bool_cols) > 0:
+            type_conv_dict = {col: str for col in bool_cols}
+            metadata_df = metadata_df.astype(type_conv_dict)
+        return metadata_df

    logging.debug("Starting the standalone rrv script.")
    loaded_biom = load_table(table)
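To make the conversion above concrete, here is a standalone sketch of the same select_dtypes()/astype() steps that read_metadata() now performs. The metadata table and column names here are invented; in the commit, read_metadata() is defined inside plot() and reads from a file path.

import io

import pandas as pd

# Invented metadata: one boolean-looking column, one numeric, one categorical.
md_tsv = io.StringIO(
    "SampleID\tAntibioticUse\tDepth\tBodySite\n"
    "S1\tTrue\t10\tgut\n"
    "S2\tFalse\t20\ttongue\n"
)
metadata_df = pd.read_csv(md_tsv, index_col=0, sep="\t")
print(dict(metadata_df.dtypes))
# {'AntibioticUse': dtype('bool'), 'Depth': dtype('int64'), 'BodySite': dtype('O')}

# Only the bool column gets rewritten; numeric and categorical columns are left
# alone, matching the QIIME 2 guideline mentioned in the docstring.
bool_cols = metadata_df.select_dtypes(include=[bool]).columns
if len(bool_cols) > 0:
    metadata_df = metadata_df.astype({col: str for col in bool_cols})

print(metadata_df["AntibioticUse"].tolist())   # ['True', 'False']
print(dict(metadata_df.dtypes))
# {'AntibioticUse': dtype('O'), 'Depth': dtype('int64'), 'BodySite': dtype('O')}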
40 changes: 36 additions & 4 deletions rankratioviz/tests/testing_utilities.py
@@ -1,6 +1,7 @@
import os
import json
from pytest import approx
+import pandas as pd
from click.testing import CliRunner
from qiime2 import Artifact, Metadata
from qiime2.plugins import rankratioviz as q2rankratioviz
@@ -255,6 +256,8 @@ def basic_vegalite_json_validation(json_obj):
def validate_rank_plot_json(input_ranks_loc, rank_json):
    """Ensure that the rank plot JSON makes sense."""

+    # TODO check that feature metadata annotations were properly applied to the
+    # features. Will need the feature metadata file location to be passed here
    reference_features = rank_file_to_df(input_ranks_loc)
    # Validate some basic properties of the plot
    # (This is all handled by Altair, so these property tests aren't
@@ -315,8 +318,37 @@ def validate_sample_plot_json(biom_table_loc, metadata_loc, sample_json):
assert sample_json["mark"] == "circle"
assert sample_json["title"] == "Log Ratio of Abundances in Samples"
basic_vegalite_json_validation(sample_json)
# dn = sample_json["data"]["name"]
# TODO check that all metadata samples are accounted for in BIOM table
# TODO check that every log ratio is correct? I guess that'll make us
# load the rank plots file, but it's worth it (tm)
dn = sample_json["data"]["name"]

# Check that each sample's metadata in the sample plot JSON matches with
# its actual metadata.
sample_metadata = pd.read_csv(metadata_loc, index_col=0, sep="\t")
for sample in sample_json["datasets"][dn]:
sample_id = sample["Sample ID"]
for metadata_col in sample_metadata.columns:
expected_md = sample_metadata.at[sample_id, metadata_col]
actual_md = sample[metadata_col]
# There are some weird things in how boolean values are
# loaded/generated between the qiime2.Metadata interface,
# pandas.read_csv, json.loads(), and Altair, so to make our lives
# easier we just care about the string representation of booleans.
# (TLDR: pandas.read_csv() results in numpy.bool_ types being
# present, which makes things super fun.)
if str(expected_md) in ["True", "False"]:
assert str(expected_md) == str(actual_md)
else:
assert expected_md == actual_md
# Not really "metadata", but just as a sanity check verify that the
# initial rankratioviz_balance of each sample is null (aka None in
# python) -- this ensures that no samples will show up when the
# visualization is initially displayed, which is the intended behavior.
assert sample["rankratioviz_balance"] is None

# TODO check that every entry (sample x feature) matches with the BIOM
# table. (If the BIOM table has, say, > 1 million entries, this might be
# dumb, but the test data right now is fine.)
# This can be done by using the rankratioviz_feature_col_ids dataset to
# understand what column mappings were set, and then using those column
# mappings to look at the rankratioviz_feature_counts.

# TODO check anything else in the sample plot JSON I'm forgetting
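As a side note, here is a small sketch (not from the commit) of the comparison quirk that the comment in validate_sample_plot_json() works around: pandas.read_csv() hands back numpy.bool_ values for the raw metadata, while the sample plot JSON (after the _plot.py change above) stores those values as the strings "True" / "False", so comparing their string representations sidesteps the mismatch.

import numpy as np

# What the test reads from the raw metadata file via pandas.read_csv():
expected_md = np.bool_(True)
# What the sample plot JSON stores after the _plot.py change above:
actual_md = "True"

print(expected_md == actual_md)            # False -- raw comparison fails
print(str(expected_md) == str(actual_md))  # True  -- string forms agree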

1 comment on commit 9d79b19

@fedarko (Collaborator, Author) commented on 9d79b19, May 3, 2019


ALSO: I totally didn't mention this in the commit message, but I accidentally committed some in-progress test code. It's fine, though; I'll fix it in a few minutes.
