Skip to content

Commit

Permalink
ENH: Store count data in separate JSON: close #102
Browse files Browse the repository at this point in the history
Might be worth eventually making this JSON load separately, or in
a web worker or something. But that's something for #58.
  • Loading branch information
fedarko committed May 13, 2019
1 parent 7ba5207 commit 041aaaf
Show file tree
Hide file tree
Showing 9 changed files with 93 additions and 73 deletions.
60 changes: 43 additions & 17 deletions rankratioviz/_plot_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
import os


def get_plot_jsons(main_js_loc, as_dict=True, return_nones=False):
def get_jsons(main_js_loc, as_dict=True, return_nones=False):
"""Extracts the plot JSONs from a main.js file generated by rankratioviz.
If either JSON isn't available, raises a ValueError unless
Expand All @@ -24,15 +24,18 @@ def get_plot_jsons(main_js_loc, as_dict=True, return_nones=False):
main_js_loc: str
The location of a main.js file, which should contain assignments
to the rankPlotJSON and samplePlotJSON (local) variables. We make
the assumption that the third and fourth lines of this file are
written as follows:
the assumption that some of the lines in this file are written as
follows:
[optional whitespace]var rankPlotJSON = {1};[optional whitespace]
[optional whitespace]var samplePlotJSON = {2};[optional whitespace]
[optional whitespace]var countJSON = {3};[optional whitespace]
where {1} is the rank plot JSON and {2} is the sample plot JSON.
where {1} is the rank plot JSON, {2} is the sample plot JSON, and {3}
is the count JSON (this one doesn't define a plot, it just specifies
the feature counts for each sample).
This function just extracts {1} and {2} and returns the two JSONs
This function just extracts {1}, {2}, and {3} and returns the JSONs
as either dicts or strings.
as_dict: bool
Expand All @@ -47,15 +50,18 @@ def get_plot_jsons(main_js_loc, as_dict=True, return_nones=False):
Returns
-------
(rank_plot, sample_plot): each plot could be a dict, str, or None
(rank_plot, sample_plot, count_json): each plot could be a dict, str,
or None
By default this returns (dict, dict). Passing as_dict=False will
cause (str, str) to be returned instead. Passing return_nones=True
will allow Nones to be in the output (so (None, None), (dict, None),
By default this returns (dict, dict, dict). Passing as_dict=False
will cause (str, str, str) to be returned instead. Passing
return_nones=True will allow Nones to be in the output (so
(None, None), (dict, None),
(None, str), etc. are all possible outputs).
"""
rank_plot_json_str = None
sample_plot_json_str = None
count_json_str = None
with open(main_js_loc, "r") as mf:
for line in mf:
# Use strip() to trim off starting and trailing whitespace; use the
Expand All @@ -66,9 +72,15 @@ def get_plot_jsons(main_js_loc, as_dict=True, return_nones=False):
rank_plot_json_str = line.strip()[19:-1]
elif line.lstrip().startswith("var samplePlotJSON = "):
sample_plot_json_str = line.strip()[21:-1]
elif line.lstrip().startswith("var countJSON = "):
count_json_str = line.strip()[16:-1]
break

if rank_plot_json_str is None or sample_plot_json_str is None:
if (
rank_plot_json_str is None
or sample_plot_json_str is None
or count_json_str is None
):
if not return_nones:
raise ValueError("Plot JSONs not found in {}.".format(main_js_loc))

Expand All @@ -83,9 +95,10 @@ def str_to_json(s):
return (
str_to_json(rank_plot_json_str),
str_to_json(sample_plot_json_str),
str_to_json(count_json_str),
)
else:
return rank_plot_json_str, sample_plot_json_str
return rank_plot_json_str, sample_plot_json_str, count_json_str


def plot_jsons_equal(json1, json2):
Expand Down Expand Up @@ -120,7 +133,11 @@ def plot_jsons_equal(json1, json2):


def replace_js_plot_json_definitions(
input_file_loc, rank_plot_json, sample_plot_json, output_file_loc=None
input_file_loc,
rank_plot_json,
sample_plot_json,
count_json,
output_file_loc=None,
):
"""Writes a version of the input JS file with plot JSON(s) changed.
Expand All @@ -130,15 +147,15 @@ def replace_js_plot_json_definitions(
output file (or the input file, if output_file_loc is None). This
function will return 1 in this case, and will return 0 otherwise.
Note that the JS variable names defined here (rankPlotJSON and
samplePlotJSON), as well as these variables being defined on
separate lines of the file, are relied on in the python tests when
Note that the JS variable names defined here (rankPlotJSON,
samplePlotJSON, and countJSON), as well as these variables being defined
on separate lines of the file, are relied on in the python tests when
extracting the JSON files from generated main.js files. If you change
the way these variables are written to in the JS, it may cause the
python tests to fail.
"""

curr_rank_plot_json, curr_sample_plot_json = get_plot_jsons(
curr_rank_plot_json, curr_sample_plot_json, curr_count_json = get_jsons(
input_file_loc, return_nones=True
)
output_file_contents = ""
Expand Down Expand Up @@ -166,6 +183,14 @@ def replace_js_plot_json_definitions(
+ ";\n"
)
at_least_one_plot_changed = True
elif line.lstrip().startswith("var countJSON = {"):
if not plot_jsons_equal(curr_count_json, count_json):
output_line = (
output_line[: output_line.index("{")]
+ json.dumps(count_json)
+ ";\n"
)
at_least_one_plot_changed = True
output_file_contents += output_line

if at_least_one_plot_changed:
Expand All @@ -185,7 +210,7 @@ def replace_js_plot_json_definitions(
"""Update JSON plot definitions for the rankratioviz web tests."""
test_dir = os.path.join("rankratioviz", "tests", "web_tests", "tests")
rrv_js_tests = filter(lambda f: f.endswith(".js"), os.listdir(test_dir))
rank_plot_json, sample_plot_json = get_plot_jsons(
rank_plot_json, sample_plot_json, count_json = get_jsons(
os.path.join(
"rankratioviz", "tests", "output", "matching_test", "main.js"
)
Expand All @@ -195,4 +220,5 @@ def replace_js_plot_json_definitions(
os.path.join(test_dir, js_test_file),
rank_plot_json,
sample_plot_json,
count_json,
)
37 changes: 13 additions & 24 deletions rankratioviz/generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -432,28 +432,11 @@ def gen_sample_plot(table, metadata):
)
)

# Save the sample plot JSON. Some notes:
# -From Altair (and Vega)'s perspective, the only "dataset" that directly
# connects to the chart is sample_metadata. This dataset contains the
# "Sample ID" and "rankratioviz_balance" columns, in addition to all of
# the sample metadata columns provided in the input sample metadata.
# -All of the feature counts for each sample (that is, taxon/metabolite
# abundances) are located in the features_ds dataset. These feature counts
# can be drawn on in the JS application when computing log ratios, and
# this lets us search through all available feature IDs/etc. without
# having to worry about accidentally mixing up metadata and feature
# counts.
# -Since feature IDs can be really long (e.g. in the case where the feature
# ID is an entire taxonomy), we convert each feature ID to a string
# integer and refer to that feature by its string integer ID. We store a
# mapping relating actual feature IDs to their string integer IDs under
# the col_ids_ds dataset, which is how we'll determine what to show to
# the user (and link features on the rank plot with feature counts in
# the sample plot) in the JS code.
sample_chart_json = sample_chart.to_dict()
features_ds = "rankratioviz_feature_counts"
sample_chart_json["datasets"][features_ds] = table.to_dict()
return sample_chart_json
# Return the JSONs as dicts for 1) the sample plot JSON (which only
# contains sample metadata), and 2) the feature counts per sample (which
# are stored separately from the sample plot JSON to avoid hurting
# performance).
return sample_chart.to_dict(), table.to_dict()


def gen_visualization(V, processed_table, df_sample_metadata, output_dir):
Expand All @@ -472,7 +455,9 @@ def gen_visualization(V, processed_table, df_sample_metadata, output_dir):
logging.debug("Generating rank plot JSON.")
rank_plot_json = gen_rank_plot(V)
logging.debug("Generating sample plot JSON.")
sample_plot_json = gen_sample_plot(processed_table, df_sample_metadata)
sample_plot_json, count_json = gen_sample_plot(
processed_table, df_sample_metadata
)
logging.debug("Finished generating both plots.")
os.makedirs(output_dir, exist_ok=True)
# copy files for the visualization
Expand Down Expand Up @@ -509,7 +494,11 @@ def gen_visualization(V, processed_table, df_sample_metadata, output_dir):
main_loc = os.path.join(support_files_loc, "main.js")
output_loc = os.path.join(output_dir, "main.js")
exit_code = replace_js_plot_json_definitions(
main_loc, rank_plot_json, sample_plot_json, output_file_loc=output_loc
main_loc,
rank_plot_json,
sample_plot_json,
count_json,
output_file_loc=output_loc,
)
if exit_code != 0:
raise ValueError("Wasn't able to replace JSONs and write to main.js.")
Expand Down
16 changes: 5 additions & 11 deletions rankratioviz/support_files/js/display.js
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ define(["./feature_computation", "vega", "vega-embed"], function(
* think that would be super useful unless you want to embed
* rankratioviz' web interface in a bunch of other environments.)
*/
constructor(rankPlotJSON, samplePlotJSON) {
constructor(rankPlotJSON, samplePlotJSON, countJSON) {
// Used for selections of log ratios between single features (via
// the rank plot)
this.onHigh = true;
Expand All @@ -45,13 +45,12 @@ define(["./feature_computation", "vega", "vega-embed"], function(
this.botFeatures = undefined;

// Used when looking up a feature's count.
this.feature_cts = undefined;

this.feature_cts = countJSON;
// Used when searching through features.
this.feature_ids = undefined;
this.feature_ids = Object.keys(this.feature_cts);

// Set when the sample plot JSON is loaded. Used to populate possible sample
// plot x-axis/colorization options.
// Set when the sample plot JSON is loaded. Used to populate
// possible sample plot x-axis/colorization options.
this.metadataCols = undefined;

// Ordered list of all ranks
Expand Down Expand Up @@ -277,11 +276,6 @@ define(["./feature_computation", "vega", "vega-embed"], function(
this.samplePlotJSON.encoding.color.field +
"]"
).selected = true;
// TODO don't store this in memory at all? since it's redundant
// with the sample plot JSON, albeit a bit more inconvenient to
// type out to access there.
this.feature_cts = this.samplePlotJSON.datasets.rankratioviz_feature_counts;
this.feature_ids = Object.keys(this.feature_cts);
}
this.updateSamplePlotTooltips();
// NOTE: Use of "patch" based on
Expand Down
3 changes: 2 additions & 1 deletion rankratioviz/support_files/main.js
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ requirejs(
// generated by Altair.
var rankPlotJSON = {};
var samplePlotJSON = {};
rrv = new display.RRVDisplay(rankPlotJSON, samplePlotJSON);
var countJSON = {};
rrv = new display.RRVDisplay(rankPlotJSON, samplePlotJSON, countJSON);
}
);
7 changes: 3 additions & 4 deletions rankratioviz/tests/test_matching.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ def test_feature_metadata_and_dropped_sample():
ranks, and the BIOM table together.
"""

rank_json, sample_json = run_integration_test(
rank_json, sample_json, count_json = run_integration_test(
"matching_test",
"matching_test",
"differentials.tsv",
Expand Down Expand Up @@ -238,10 +238,9 @@ def test_feature_metadata_and_dropped_sample():
for sample in sample_json["datasets"][data_name]:
assert sample["Sample ID"] != "Sample4"

cts_data = sample_json["datasets"]["rankratioviz_feature_counts"]
for txid in cts_data:
for txid in count_json:
# Assert that Sample4 was also dropped from the counts data in the JSON
assert "Sample4" not in cts_data[txid]
assert "Sample4" not in count_json[txid]
# Assert that Taxon3's annotation carried over to the sample plot
if txid.startswith("Taxon3"):
for fm in relevant_feature_metadata:
Expand Down
25 changes: 14 additions & 11 deletions rankratioviz/tests/testing_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import rankratioviz.scripts._plot as rrvp
from rankratioviz._rank_utils import read_rank_file
from rankratioviz._metadata_utils import read_metadata_file
from rankratioviz._plot_utils import get_plot_jsons
from rankratioviz._plot_utils import get_jsons


def run_integration_test(
Expand Down Expand Up @@ -94,8 +94,10 @@ def run_integration_test(
if expect_all_unsupported_samples or expected_unsupported_features > 0:
return None, None
else:
rank_json, sample_json = validate_main_js(out_dir, rloc, tloc, sloc)
return rank_json, sample_json
rank_json, sample_json, count_json = validate_main_js(
out_dir, rloc, tloc, sloc
)
return rank_json, sample_json, count_json


def validate_standalone_result(
Expand Down Expand Up @@ -178,13 +180,13 @@ def validate_main_js(out_dir, rloc, tloc, sloc):
"""

main_loc = os.path.join(out_dir, "main.js")
rank_json, sample_json = get_plot_jsons(main_loc)
rank_json, sample_json, count_json = get_jsons(main_loc)

# Validate plot JSONs
validate_rank_plot_json(rloc, rank_json)
validate_sample_plot_json(tloc, sloc, sample_json)
validate_sample_plot_json(tloc, sloc, sample_json, count_json)

return rank_json, sample_json
return rank_json, sample_json, count_json


def validate_samples_supported_output(output, expected_unsupported_samples):
Expand Down Expand Up @@ -280,7 +282,9 @@ def validate_rank_plot_json(input_ranks_loc, rank_json):
prev_x_val = feature["rankratioviz_x"]


def validate_sample_plot_json(biom_table_loc, metadata_loc, sample_json):
def validate_sample_plot_json(
biom_table_loc, metadata_loc, sample_json, count_json
):
assert sample_json["mark"] == "circle"
assert sample_json["title"] == "Log Ratio of Abundances in Samples"
basic_vegalite_json_validation(sample_json)
Expand Down Expand Up @@ -308,17 +312,16 @@ def validate_sample_plot_json(biom_table_loc, metadata_loc, sample_json):
# If the BIOM table has, say, > 1 million entries, this might be excessive,
# but the test data right now is small enough that this should be fine.
table = load_table(biom_table_loc)
counts = sample_json["datasets"]["rankratioviz_feature_counts"]

# For each (ranked) feature...
for feature_id in counts:
for feature_id in count_json:
# Get its base ID (the ID it is referred to by in the input BIOM table
# and feature rankings file), and its column ID (the integer ID it's
# referred to by in the JSON count data).
feature_base_id = feature_id.split("|")[0]
# For each sample, ensure that the count value in the JSON matches with
# the count value in the BIOM table.
for sample_id in counts[feature_id]:
actual_count = counts[feature_id][sample_id]
for sample_id in count_json[feature_id]:
actual_count = count_json[feature_id][sample_id]
expected_count = table.get_value_by_ids(feature_base_id, sample_id)
assert actual_count == expected_count
Loading

0 comments on commit 041aaaf

Please sign in to comment.