MAINT: reorder process_input stuff; work w/ BIOMs

It turns out that we need to delay matching until after filtering out unextreme features, in order to avoid computing the intersection of two extremely large indices (EMP has on the order of 200k observations, and trying to match those up is going to be horrible). It also seems that _df_utils.remove_empty_samples() was very slow, even on SparseDataFrames. To make things easier -- and because I know from experience that this approach worked, and let Qurro handle huge EMP-scale amounts of data -- I reorganized things so that we first filter out unextreme features and empty samples, and only THEN do matching. This works, but it breaks a lot of the unit tests (and some of the integration tests that rely on specific error messages, which have now changed). These need to be fixed, and then everything needs to be double-checked to make sure it works properly. This is a prerequisite for #58; I also have an algorithm for that sketched out.
biocore · Jul 2, 2019 · a7f4b7d · a7f4b7d
1 parent 12c5c4a
commit a7f4b7d
Show file tree

Hide file tree

Showing 6 changed files with 104 additions and 90 deletions.
diff --git a/qurro/_df_utils.py b/qurro/_df_utils.py
@@ -145,46 +145,39 @@ def biom_table_to_sparse_df(table, min_row_ct=2, min_col_ct=1):
     return table_sdf
 
 
-def remove_empty_samples(table_sdf, sample_metadata_df):
-    """Removes samples with 0 counts for every feature from the table and
-       sample metadata DataFrame.
-
-       This should be called *after* matching the table with the sample
-       metadata -- we assume that the columns of the table DataFrame are
-       equivalent to the indices of the sample metadata DataFrame.
-
-       This will raise a ValueError if, after removing empty samples, either
-       the table's columns or the metadata's indices are empty (this will
-       happen in the case where all of the samples in these DataFrames are
-       empty).
+def remove_empty_samples(biom_table):
+    """Removes samples with 0 counts for every feature from a BIOM table.
+
+       Parameters
+       ----------
+
+       biom_table: biom.Table
+            The BIOM table to filter. Filtering is requested with
+            inplace=False, so the result is returned as a separate table.
+
+       Returns
+       -------
+
+       filtered_table: biom.Table
+            The table produced by removing the empty samples from biom_table.
+
+       Raises
+       ------
+
+       ValueError
+            If, after removing empty samples, the table's columns are empty
+            (this will happen if all of the samples in the table are empty).
     """
     logging.debug("Attempting to remove empty samples.")
-    table_df_equal_to_zero = table_sdf == 0
-    nonempty_samples = []
-    for sample in table_sdf.columns:
-        if not table_df_equal_to_zero[sample].all():
-            nonempty_samples.append(sample)
-
-    filtered_table = table_sdf.filter(items=nonempty_samples, axis="columns")
-    filtered_metadata = sample_metadata_df.filter(
-        items=nonempty_samples, axis="index"
-    )
+    filtered_table = biom_table.remove_empty(axis="sample", inplace=False)
 
-    if len(filtered_table.columns) < 1 or len(filtered_metadata.index) < 1:
+    if filtered_table.shape[1] < 1:
         raise ValueError("Found all empty samples with current features.")
 
-    sample_diff = len(table_sdf.columns) - len(filtered_table.columns)
+    # Number of samples before filtering minus the number after it. This is
+    # positive exactly when at least one empty sample was dropped. (Taking
+    # the difference the other way around can never be positive, which would
+    # make the "Removed ..." message below unreachable.)
+    sample_diff = biom_table.shape[1] - filtered_table.shape[1]
     if sample_diff > 0:
         logging.debug("Removed {} empty sample(s).".format(sample_diff))
     else:
-        logging.debug("Couldn't find any empty samples.")
+        logging.debug("Couldn't find any empty samples to remove.")
 
-    return filtered_table, filtered_metadata
+    return filtered_table
 
 
 def match_table_and_data(table, feature_ranks, sample_metadata):
     """Matches feature rankings and then sample metadata to a table.
 
+       This should bring us to a point where every specified feature/sample is
+       supported in the output table DataFrame.
+
+       Note that the input table here might contain features or samples that
+       are not included in feature_ranks or sample_metadata, respectively --
+       this is totally fine. However, errors may be raised if the opposite is
+       true; see the "Raises" section below for details.
+
        Parameters
        ----------
 
@@ -225,12 +218,11 @@ def match_table_and_data(table, feature_ranks, sample_metadata):
        in the table, this will raise a ValueError.
     """
     logging.debug("Starting matching table with feature/sample data.")
-    # Match features to BIOM table, and then match samples to BIOM table.
-    # This should bring us to a point where every feature/sample is
-    # supported in the BIOM table. (Note that the input BIOM table might
-    # contain features or samples that are not included in feature_ranks or
-    # sample_metadata, respectively -- this is totally fine. The opposite,
-    # though, is a big no-no.)
+    # NOTE: if we actually did filtering in filter_unextreme_features, then
+    # this is an unnecessary step. TODO: make note of this and avoid this
+    # unnecessary operation in that case?
+    logging.debug("Starting matching table with feature rankings.")
+
     featurefiltered_table, m_feature_ranks = matchdf(table, feature_ranks)
     logging.debug("Matching table with feature ranks done.")
     # Ensure that every ranked feature was present in the BIOM table. Raise an
@@ -244,8 +236,8 @@ def match_table_and_data(table, feature_ranks, sample_metadata):
         if unsupported_feature_ct == 1:
             word = "was"
         raise ValueError(
-            "Of the {} ranked features, {} {} not present in "
-            "the input BIOM table.".format(
+            "Of {} ranked features, {} {} not present in "
+            "the BIOM table.".format(
                 feature_ranks.shape[0], unsupported_feature_ct, word
             )
         )
@@ -260,7 +252,7 @@ def match_table_and_data(table, feature_ranks, sample_metadata):
     if m_sample_metadata.shape[0] < 1:
         raise ValueError(
             "None of the samples in the sample metadata file "
-            "are present in the input BIOM table."
+            "are present in the BIOM table."
         )
 
     dropped_sample_ct = sample_metadata.shape[0] - m_sample_metadata.shape[0]

diff --git a/qurro/_rank_utils.py b/qurro/_rank_utils.py
@@ -8,6 +8,7 @@
 # ----------------------------------------------------------------------------
 
 import logging
+import biom
 import skbio
 import pandas as pd
 from qurro._df_utils import escape_columns
@@ -89,7 +90,7 @@ def differentials_to_df(differentials_loc):
 
 
 def filter_unextreme_features(
-    table: pd.SparseDataFrame,
+    table: biom.Table,
     ranks: pd.DataFrame,
     extreme_feature_count: int,
     print_warning: bool = True,
@@ -99,10 +100,10 @@ def filter_unextreme_features(
        Parameters
        ----------
 
-       table: pd.SparseDataFrame
-            A SparseDataFrame representation of a BIOM table. This can be
-            generated easily from a biom.Table object using
-            qurro._df_utils.biom_table_to_sparse_df().
+       table: biom.Table
+            A BIOM table for the dataset.
+            This checks to make sure that the remaining "extreme" features are
+            all in the table -- if not, then this throws a ValueError.
 
        ranks: pandas.DataFrame
             A DataFrame where the index consists of ranked features' IDs, and
@@ -123,8 +124,8 @@ def filter_unextreme_features(
        Returns
        -------
 
-       (table, ranks): (pandas.SparseDataFrame, pandas.DataFrame)
-            Filtered copies of the input table and ranks DataFrames.
+       (table, ranks): (biom.Table, pandas.DataFrame)
+            Filtered copies of the input BIOM table and feature ranking DF.
 
        Behavior
        --------
@@ -182,6 +183,7 @@ def filter_unextreme_features(
     )
     logging.debug("Input table has shape {}.".format(table.shape))
     logging.debug("Input feature ranks have shape {}.".format(ranks.shape))
+
     # We store these features in a set to avoid duplicates -- Python does the
     # hard work here for us
     features_to_preserve = set()
@@ -193,7 +195,26 @@ def filter_unextreme_features(
 
     # Also filter ranks. Fortunately, DataFrame.filter() makes this easy.
     filtered_ranks = ranks.filter(items=features_to_preserve, axis="index")
-    filtered_table = table.filter(items=features_to_preserve, axis="index")
+
+    # Filter the BIOM table to desired features.
+    def filter_biom_table(values, feature_id, _):
+        return feature_id in features_to_preserve
+
+    filtered_table = table.filter(
+        filter_biom_table, axis="observation", inplace=False
+    )
+
+    # Since Qurro filters unextreme features before matching the table with the
+    # feature ranks, there's the possibility that all of the features that we
+    # filtered the table to are not actually *present* in the table. So we need
+    # to quickly verify that the table contains all of the "extreme" features.
+    table_feature_ct = filtered_table.shape[0]
+    ranks_feature_ct = len(filtered_ranks.index)
+    if table_feature_ct < ranks_feature_ct:
+        raise ValueError(
+            '{} "extreme" ranked feature(s) were not present in '
+            "the input BIOM table.".format(ranks_feature_ct - table_feature_ct)
+        )
 
     logging.debug("Output table has shape {}.".format(filtered_table.shape))
     logging.debug(

diff --git a/qurro/generate.py b/qurro/generate.py
@@ -130,36 +130,34 @@ def process_input(
     if feature_metadata is not None:
         feature_metadata = replace_nan(feature_metadata)
 
-    table = biom_table_to_sparse_df(biom_table)
-
-    # Match up the table with the feature ranks and sample metadata.
-    m_table, m_sample_metadata = match_table_and_data(
-        table, feature_ranks, sample_metadata
-    )
-
     # Note that although we always call filter_unextreme_features(), filtering
     # isn't necessarily always done (whether or not depends on the value of
     # extreme_feature_count and the contents of the table/ranks).
-    filtered_table, filtered_ranks = filter_unextreme_features(
-        m_table, feature_ranks, extreme_feature_count
+    filtered_biom_table, filtered_ranks = filter_unextreme_features(
+        biom_table, feature_ranks, extreme_feature_count
     )
 
     # Filter now-empty samples from the BIOM table.
-    output_table, output_metadata = remove_empty_samples(
-        filtered_table, m_sample_metadata
+    filtered_biom_table = remove_empty_samples(filtered_biom_table)
+
+    unmatched_table = biom_table_to_sparse_df(filtered_biom_table)
+
+    # Match up the table with the feature ranks and sample metadata.
+    output_table, output_metadata = match_table_and_data(
+        unmatched_table, filtered_ranks, sample_metadata
     )
 
     # Save a list of ranking IDs (before we add in feature metadata)
     ranking_ids = filtered_ranks.columns
 
-    filtered_ranks, feature_metadata_cols = merge_feature_metadata(
+    output_ranks, feature_metadata_cols = merge_feature_metadata(
         filtered_ranks, feature_metadata
     )
 
     logging.debug("Finished input processing.")
     return (
         output_metadata,
-        filtered_ranks,
+        output_ranks,
         ranking_ids,
         feature_metadata_cols,
         output_table,

diff --git a/qurro/tests/input/moving_pictures/biplot.qzv b/qurro/tests/input/moving_pictures/biplot.qzv
diff --git a/qurro/tests/input/moving_pictures/qurro-plot.qzv b/qurro/tests/input/moving_pictures/qurro-plot.qzv
diff --git a/qurro/tests/test_filter_unextreme_features.py b/qurro/tests/test_filter_unextreme_features.py
@@ -1,10 +1,10 @@
 import biom
-from numpy import arange
+import numpy as np
+from numpy.testing import assert_array_equal
 from pandas import DataFrame
 from pandas.testing import assert_frame_equal
 import pytest
 from qurro._rank_utils import filter_unextreme_features
-from qurro.generate import biom_table_to_sparse_df
 
 
 def get_test_data():
@@ -22,58 +22,61 @@ def get_test_data():
     # Based on the BIOM docs' example of initialization using a np ndarray --
     # http://biom-format.org/documentation/table_objects.html#examples
     #
-    # arange(40) generates a numpy ndarray that just goes from 0 to 39 (i.e.
+    # np.arange(40) generates a numpy ndarray that just goes from 0 to 39 (i.e.
     # contains 40 numbers). We reshape this ndarray to give it a sort of
     # "tabular" structure (a 2-D array containing 8 arrays, each with 5
     # numbers).
-    underlying_table_data = arange(40).reshape(8, 5)
+    underlying_table_data = np.arange(40).reshape(8, 5)
     # Set the third sample in the data to contain all zeros, except for a
     # count for F4 (so we can test what this function does with so-called
     # "empty" samples after filtering out F4).
     underlying_table_data[:, 2] = 0.0
     underlying_table_data[3, 2] = 1.0
     # Finally, use the data to create a BIOM table object.
     biom_table = biom.Table(underlying_table_data, feature_ids, sample_ids)
-    # ...And yeah we're actually making it into a Sparse DF because that's what
-    # I changed filter_unextreme_features() to expect now.
-    # (TODO: simplify this code in the future?)
-    output_table = biom_table_to_sparse_df(biom_table)
 
-    return output_table, ranks
+    return biom_table, ranks
 
 
 def test_filtering_basic():
     """Tests the standard behavior of filter_unextreme_features()."""
 
-    table, ranks = get_test_data()
+    biom_table, ranks = get_test_data()
     filtered_table, filtered_ranks = filter_unextreme_features(
-        table, ranks, 2, print_warning=False
+        biom_table, ranks, 2, print_warning=False
     )
     # Check that the appropriate features/samples were filtered out of the
     # table. NOTE -- I know this is sloppy code. Would like to fix it in the
     # future.
     for fid in ["F1", "F2", "F7", "F8"]:
-        assert fid in filtered_table.index
+        assert filtered_table.exists(fid, axis="observation")
     for fid in ["F3", "F4", "F5", "F6"]:
-        assert fid not in filtered_table.index
+        assert not filtered_table.exists(fid, axis="observation")
     # Check that all samples were preserved.
     # (The removal of empty samples is done *after*
     # filter_unextreme_features() is called in normal Qurro execution, so we
     # should expect all samples -- even empty ones -- to remain here.)
     for sid in ["S1", "S2", "S3", "S4", "S5"]:
-        assert sid in filtered_table.columns
-
+        assert filtered_table.exists(sid, axis="sample")
     # Check that the appropriate data is left in the table.
-    assert list(filtered_table.loc["F1"]) == [0, 1, 0, 3, 4]
-    assert list(filtered_table.loc["F2"]) == [5, 6, 0, 8, 9]
-    assert list(filtered_table.loc["F7"]) == [30, 31, 0, 33, 34]
-    assert list(filtered_table.loc["F8"]) == [35, 36, 0, 38, 39]
+    assert_array_equal(
+        filtered_table.data("F1", axis="observation"), [0, 1, 0, 3, 4]
+    )
+    assert_array_equal(
+        filtered_table.data("F2", axis="observation"), [5, 6, 0, 8, 9]
+    )
+    assert_array_equal(
+        filtered_table.data("F7", axis="observation"), [30, 31, 0, 33, 34]
+    )
+    assert_array_equal(
+        filtered_table.data("F8", axis="observation"), [35, 36, 0, 38, 39]
+    )
 
-    # Check that the rank filtering worked as expected.
     expected_filtered_ranks = DataFrame(
         {"Rank 0": [1, 2, 7, 8], "Rank 1": [8, 7, 2, 1]},
         index=["F1", "F2", "F7", "F8"],
     )
+    # Check that the rank filtering worked as expected.
     assert_frame_equal(
         filtered_ranks, expected_filtered_ranks, check_like=True
     )
@@ -84,19 +87,19 @@ def test_filtering_large_efc():
        is greater than or equal to the number of ranked features.
     """
 
-    table, ranks = get_test_data()
+    biom_table, ranks = get_test_data()
 
     # The number of ranked features is 8.
     filtered_table, filtered_ranks = filter_unextreme_features(
-        table, ranks, 4, print_warning=False
+        biom_table, ranks, 4, print_warning=False
     )
-    assert_frame_equal(table, filtered_table)
+    assert biom_table == filtered_table
     assert_frame_equal(ranks, filtered_ranks)
 
     filtered_table, filtered_ranks = filter_unextreme_features(
-        table, ranks, 8, print_warning=False
+        biom_table, ranks, 8, print_warning=False
     )
-    assert_frame_equal(table, filtered_table)
+    assert biom_table == filtered_table
     assert_frame_equal(ranks, filtered_ranks)
 
 
@@ -106,12 +109,12 @@ def test_filtering_no_efc():
        done).
     """
 
-    table, ranks = get_test_data()
+    biom_table, ranks = get_test_data()
 
     filtered_table, filtered_ranks = filter_unextreme_features(
-        table, ranks, None, print_warning=False
+        biom_table, ranks, None, print_warning=False
     )
-    assert_frame_equal(table, filtered_table)
+    assert biom_table == filtered_table
     assert_frame_equal(ranks, filtered_ranks)
 
 
@@ -120,19 +123,19 @@ def test_filtering_invalid_efc():
        extreme feature count is less than 1 and/or not an integer.
     """
 
-    table, ranks = get_test_data()
+    biom_table, ranks = get_test_data()
 
     with pytest.raises(ValueError):
-        filter_unextreme_features(table, ranks, 0)
+        filter_unextreme_features(biom_table, ranks, 0)
 
     with pytest.raises(ValueError):
-        filter_unextreme_features(table, ranks, -1)
+        filter_unextreme_features(biom_table, ranks, -1)
 
     with pytest.raises(ValueError):
-        filter_unextreme_features(table, ranks, -2)
+        filter_unextreme_features(biom_table, ranks, -2)
 
     with pytest.raises(ValueError):
-        filter_unextreme_features(table, ranks, 1.5)
+        filter_unextreme_features(biom_table, ranks, 1.5)
 
     with pytest.raises(ValueError):
-        filter_unextreme_features(table, ranks, 5.5)
+        filter_unextreme_features(biom_table, ranks, 5.5)