ENH: Store count data in separate JSON: close #102

It might eventually be worth loading this JSON separately (e.g. in a web worker), but that is something to address in #58.
biocore · May 13, 2019 · 041aaaf · 041aaaf
1 parent 7ba5207
commit 041aaaf
Show file tree

Hide file tree

Showing 9 changed files with 93 additions and 73 deletions.
diff --git a/rankratioviz/_plot_utils.py b/rankratioviz/_plot_utils.py
@@ -12,7 +12,7 @@
 import os
 
 
-def get_plot_jsons(main_js_loc, as_dict=True, return_nones=False):
+def get_jsons(main_js_loc, as_dict=True, return_nones=False):
     """Extracts the plot JSONs from a main.js file generated by rankratioviz.
 
        If either JSON isn't available, raises a ValueError unless
@@ -24,15 +24,18 @@ def get_plot_jsons(main_js_loc, as_dict=True, return_nones=False):
        main_js_loc: str
           The location of a main.js file, which should contain assignments
           to the rankPlotJSON and samplePlotJSON (local) variables. We make
-          the assumption that the third and fourth lines of this file are
-          written as follows:
+          the assumption that some of the lines in this file are written as
+          follows:
 
           [optional whitespace]var rankPlotJSON = {1};[optional whitespace]
           [optional whitespace]var samplePlotJSON = {2};[optional whitespace]
+          [optional whitespace]var countJSON = {3};[optional whitespace]
 
-          where {1} is the rank plot JSON and {2} is the sample plot JSON.
+          where {1} is the rank plot JSON, {2} is the sample plot JSON, and {3}
+          is the count JSON (this one doesn't define a plot, it just specifies
+          the feature counts for each sample).
 
-          This function just extracts {1} and {2} and returns the two JSONs
+          This function just extracts {1}, {2}, and {3} and returns the JSONs
           as either dicts or strings.
 
        as_dict: bool
@@ -47,15 +50,18 @@ def get_plot_jsons(main_js_loc, as_dict=True, return_nones=False):
 
        Returns
        -------
-       (rank_plot, sample_plot): each plot could be a dict, str, or None
+       (rank_plot, sample_plot, count_json): each plot could be a dict, str,
+                                             or None
 
-          By default this returns (dict, dict). Passing as_dict=False will
-          cause (str, str) to be returned instead. Passing return_nones=True
-          will allow Nones to be in the output (so (None, None), (dict, None),
+          By default this returns (dict, dict, dict). Passing as_dict=False
+          will cause (str, str, str) to be returned instead. Passing
+          return_nones=True will allow Nones to be in the output (so
+          (None, None), (dict, None),
           (None, str), etc. are all possible outputs).
     """
     rank_plot_json_str = None
     sample_plot_json_str = None
+    count_json_str = None
     with open(main_js_loc, "r") as mf:
         for line in mf:
             # Use strip() to trim off starting and trailing whitespace; use the
@@ -66,9 +72,15 @@ def get_plot_jsons(main_js_loc, as_dict=True, return_nones=False):
                 rank_plot_json_str = line.strip()[19:-1]
             elif line.lstrip().startswith("var samplePlotJSON = "):
                 sample_plot_json_str = line.strip()[21:-1]
+            elif line.lstrip().startswith("var countJSON = "):
+                count_json_str = line.strip()[16:-1]
                 break
 
-    if rank_plot_json_str is None or sample_plot_json_str is None:
+    if (
+        rank_plot_json_str is None
+        or sample_plot_json_str is None
+        or count_json_str is None
+    ):
         if not return_nones:
             raise ValueError("Plot JSONs not found in {}.".format(main_js_loc))
 
@@ -83,9 +95,10 @@ def str_to_json(s):
         return (
             str_to_json(rank_plot_json_str),
             str_to_json(sample_plot_json_str),
+            str_to_json(count_json_str),
         )
     else:
-        return rank_plot_json_str, sample_plot_json_str
+        return rank_plot_json_str, sample_plot_json_str, count_json_str
 
 
 def plot_jsons_equal(json1, json2):
@@ -120,7 +133,11 @@ def plot_jsons_equal(json1, json2):
 
 
 def replace_js_plot_json_definitions(
-    input_file_loc, rank_plot_json, sample_plot_json, output_file_loc=None
+    input_file_loc,
+    rank_plot_json,
+    sample_plot_json,
+    count_json,
+    output_file_loc=None,
 ):
     """Writes a version of the input JS file with plot JSON(s) changed.
 
@@ -130,15 +147,15 @@ def replace_js_plot_json_definitions(
        output file (or the input file, if output_file_loc is None). This
        function will return 1 in this case, and will return 0 otherwise.
 
-       Note that the JS variable names defined here (rankPlotJSON and
-       samplePlotJSON), as well as these variables being defined on
-       separate lines of the file, are relied on in the python tests when
+       Note that the JS variable names defined here (rankPlotJSON,
+       samplePlotJSON, and countJSON), as well as these variables being defined
+       on separate lines of the file, are relied on in the python tests when
        extracting the JSON files from generated main.js files. If you change
        the way these variables are written to in the JS, it may cause the
        python tests to fail.
     """
 
-    curr_rank_plot_json, curr_sample_plot_json = get_plot_jsons(
+    curr_rank_plot_json, curr_sample_plot_json, curr_count_json = get_jsons(
         input_file_loc, return_nones=True
     )
     output_file_contents = ""
@@ -166,6 +183,14 @@ def replace_js_plot_json_definitions(
                         + ";\n"
                     )
                     at_least_one_plot_changed = True
+            elif line.lstrip().startswith("var countJSON = {"):
+                if not plot_jsons_equal(curr_count_json, count_json):
+                    output_line = (
+                        output_line[: output_line.index("{")]
+                        + json.dumps(count_json)
+                        + ";\n"
+                    )
+                    at_least_one_plot_changed = True
             output_file_contents += output_line
 
     if at_least_one_plot_changed:
@@ -185,7 +210,7 @@ def replace_js_plot_json_definitions(
     """Update JSON plot definitions for the rankratioviz web tests."""
     test_dir = os.path.join("rankratioviz", "tests", "web_tests", "tests")
     rrv_js_tests = filter(lambda f: f.endswith(".js"), os.listdir(test_dir))
-    rank_plot_json, sample_plot_json = get_plot_jsons(
+    rank_plot_json, sample_plot_json, count_json = get_jsons(
         os.path.join(
             "rankratioviz", "tests", "output", "matching_test", "main.js"
         )
@@ -195,4 +220,5 @@ def replace_js_plot_json_definitions(
             os.path.join(test_dir, js_test_file),
             rank_plot_json,
             sample_plot_json,
+            count_json,
         )
diff --git a/rankratioviz/generate.py b/rankratioviz/generate.py
@@ -432,28 +432,11 @@ def gen_sample_plot(table, metadata):
         )
     )
 
-    # Save the sample plot JSON. Some notes:
-    # -From Altair (and Vega)'s perspective, the only "dataset" that directly
-    #  connects to the chart is sample_metadata. This dataset contains the
-    #  "Sample ID" and "rankratioviz_balance" columns, in addition to all of
-    #  the sample metadata columns provided in the input sample metadata.
-    # -All of the feature counts for each sample (that is, taxon/metabolite
-    #  abundances) are located in the features_ds dataset. These feature counts
-    #  can be drawn on in the JS application when computing log ratios, and
-    #  this lets us search through all available feature IDs/etc. without
-    #  having to worry about accidentally mixing up metadata and feature
-    #  counts.
-    # -Since feature IDs can be really long (e.g. in the case where the feature
-    #  ID is an entire taxonomy), we convert each feature ID to a string
-    #  integer and refer to that feature by its string integer ID. We store a
-    #  mapping relating actual feature IDs to their string integer IDs under
-    #  the col_ids_ds dataset, which is how we'll determine what to show to
-    #  the user (and link features on the rank plot with feature counts in
-    #  the sample plot) in the JS code.
-    sample_chart_json = sample_chart.to_dict()
-    features_ds = "rankratioviz_feature_counts"
-    sample_chart_json["datasets"][features_ds] = table.to_dict()
-    return sample_chart_json
+    # Return two dicts: 1) the sample plot JSON (which only contains sample
+    # metadata), and 2) the feature counts per sample. The counts are kept
+    # separate from the sample plot JSON so that they don't hurt performance
+    # too badly.
+    return sample_chart.to_dict(), table.to_dict()
 
 
 def gen_visualization(V, processed_table, df_sample_metadata, output_dir):
@@ -472,7 +455,9 @@ def gen_visualization(V, processed_table, df_sample_metadata, output_dir):
     logging.debug("Generating rank plot JSON.")
     rank_plot_json = gen_rank_plot(V)
     logging.debug("Generating sample plot JSON.")
-    sample_plot_json = gen_sample_plot(processed_table, df_sample_metadata)
+    sample_plot_json, count_json = gen_sample_plot(
+        processed_table, df_sample_metadata
+    )
     logging.debug("Finished generating both plots.")
     os.makedirs(output_dir, exist_ok=True)
     # copy files for the visualization
@@ -509,7 +494,11 @@ def gen_visualization(V, processed_table, df_sample_metadata, output_dir):
     main_loc = os.path.join(support_files_loc, "main.js")
     output_loc = os.path.join(output_dir, "main.js")
     exit_code = replace_js_plot_json_definitions(
-        main_loc, rank_plot_json, sample_plot_json, output_file_loc=output_loc
+        main_loc,
+        rank_plot_json,
+        sample_plot_json,
+        count_json,
+        output_file_loc=output_loc,
     )
     if exit_code != 0:
         raise ValueError("Wasn't able to replace JSONs and write to main.js.")

diff --git a/rankratioviz/support_files/js/display.js b/rankratioviz/support_files/js/display.js
@@ -31,7 +31,7 @@ define(["./feature_computation", "vega", "vega-embed"], function(
          * think that would be super useful unless you want to embed
          * rankratioviz' web interface in a bunch of other environments.)
          */
-        constructor(rankPlotJSON, samplePlotJSON) {
+        constructor(rankPlotJSON, samplePlotJSON, countJSON) {
             // Used for selections of log ratios between single features (via
             // the rank plot)
             this.onHigh = true;
@@ -45,13 +45,12 @@ define(["./feature_computation", "vega", "vega-embed"], function(
             this.botFeatures = undefined;
 
             // Used when looking up a feature's count.
-            this.feature_cts = undefined;
-
+            this.feature_cts = countJSON;
             // Used when searching through features.
-            this.feature_ids = undefined;
+            this.feature_ids = Object.keys(this.feature_cts);
 
-            // Set when the sample plot JSON is loaded. Used to populate possible sample
-            // plot x-axis/colorization options.
+            // Set when the sample plot JSON is loaded. Used to populate
+            // possible sample plot x-axis/colorization options.
             this.metadataCols = undefined;
 
             // Ordered list of all ranks
@@ -277,11 +276,6 @@ define(["./feature_computation", "vega", "vega-embed"], function(
                             this.samplePlotJSON.encoding.color.field +
                             "]"
                     ).selected = true;
-                // TODO don't store this in memory at all? since it's redundant
-                // with the sample plot JSON, albeit a bit more inconvenient to
-                // type out to access there.
-                this.feature_cts = this.samplePlotJSON.datasets.rankratioviz_feature_counts;
-                this.feature_ids = Object.keys(this.feature_cts);
             }
             this.updateSamplePlotTooltips();
             // NOTE: Use of "patch" based on

diff --git a/rankratioviz/support_files/main.js b/rankratioviz/support_files/main.js
@@ -20,6 +20,7 @@ requirejs(
         // generated by Altair.
         var rankPlotJSON = {};
         var samplePlotJSON = {};
-        rrv = new display.RRVDisplay(rankPlotJSON, samplePlotJSON);
+        var countJSON = {};
+        rrv = new display.RRVDisplay(rankPlotJSON, samplePlotJSON, countJSON);
     }
 );
diff --git a/rankratioviz/tests/test_matching.py b/rankratioviz/tests/test_matching.py
@@ -203,7 +203,7 @@ def test_feature_metadata_and_dropped_sample():
        ranks, and the BIOM table together.
     """
 
-    rank_json, sample_json = run_integration_test(
+    rank_json, sample_json, count_json = run_integration_test(
         "matching_test",
         "matching_test",
         "differentials.tsv",
@@ -238,10 +238,9 @@ def test_feature_metadata_and_dropped_sample():
     for sample in sample_json["datasets"][data_name]:
         assert sample["Sample ID"] != "Sample4"
 
-    cts_data = sample_json["datasets"]["rankratioviz_feature_counts"]
-    for txid in cts_data:
+    for txid in count_json:
         # Assert that Sample4 was also dropped from the counts data in the JSON
-        assert "Sample4" not in cts_data[txid]
+        assert "Sample4" not in count_json[txid]
         # Assert that Taxon3's annotation carried over to the sample plot
         if txid.startswith("Taxon3"):
             for fm in relevant_feature_metadata:

diff --git a/rankratioviz/tests/testing_utilities.py b/rankratioviz/tests/testing_utilities.py
@@ -7,7 +7,7 @@
 import rankratioviz.scripts._plot as rrvp
 from rankratioviz._rank_utils import read_rank_file
 from rankratioviz._metadata_utils import read_metadata_file
-from rankratioviz._plot_utils import get_plot_jsons
+from rankratioviz._plot_utils import get_jsons
 
 
 def run_integration_test(
@@ -94,8 +94,10 @@ def run_integration_test(
     if expect_all_unsupported_samples or expected_unsupported_features > 0:
         return None, None
     else:
-        rank_json, sample_json = validate_main_js(out_dir, rloc, tloc, sloc)
-        return rank_json, sample_json
+        rank_json, sample_json, count_json = validate_main_js(
+            out_dir, rloc, tloc, sloc
+        )
+        return rank_json, sample_json, count_json
 
 
 def validate_standalone_result(
@@ -178,13 +180,13 @@ def validate_main_js(out_dir, rloc, tloc, sloc):
     """
 
     main_loc = os.path.join(out_dir, "main.js")
-    rank_json, sample_json = get_plot_jsons(main_loc)
+    rank_json, sample_json, count_json = get_jsons(main_loc)
 
     # Validate plot JSONs
     validate_rank_plot_json(rloc, rank_json)
-    validate_sample_plot_json(tloc, sloc, sample_json)
+    validate_sample_plot_json(tloc, sloc, sample_json, count_json)
 
-    return rank_json, sample_json
+    return rank_json, sample_json, count_json
 
 
 def validate_samples_supported_output(output, expected_unsupported_samples):
@@ -280,7 +282,9 @@ def validate_rank_plot_json(input_ranks_loc, rank_json):
         prev_x_val = feature["rankratioviz_x"]
 
 
-def validate_sample_plot_json(biom_table_loc, metadata_loc, sample_json):
+def validate_sample_plot_json(
+    biom_table_loc, metadata_loc, sample_json, count_json
+):
     assert sample_json["mark"] == "circle"
     assert sample_json["title"] == "Log Ratio of Abundances in Samples"
     basic_vegalite_json_validation(sample_json)
@@ -308,17 +312,16 @@ def validate_sample_plot_json(biom_table_loc, metadata_loc, sample_json):
     # If the BIOM table has, say, > 1 million entries, this might be excessive,
     # but the test data right now is small enough that this should be fine.
     table = load_table(biom_table_loc)
-    counts = sample_json["datasets"]["rankratioviz_feature_counts"]
 
     # For each (ranked) feature...
-    for feature_id in counts:
+    for feature_id in count_json:
         # Get its base ID (the ID it is referred to by in the input BIOM table
         # and feature rankings file), and its column ID (the integer ID it's
         # referred to by in the JSON count data).
         feature_base_id = feature_id.split("|")[0]
         # For each sample, ensure that the count value in the JSON matches with
         # the count value in the BIOM table.
-        for sample_id in counts[feature_id]:
-            actual_count = counts[feature_id][sample_id]
+        for sample_id in count_json[feature_id]:
+            actual_count = count_json[feature_id][sample_id]
             expected_count = table.get_value_by_ids(feature_base_id, sample_id)
             assert actual_count == expected_count