Skip to content

Commit

Permalink
MAINT: reorder process_input stuff; work w/ BIOMs
Browse files Browse the repository at this point in the history
Turns out that we need to delay matching until after filtering
out unextreme features in order to avoid computing the intersection
of two super-huge indices (EMP has on the order of 200k observations
and trying to match those up is going to be horrible).

It also seems that _df_utils.remove_empty_samples() was super slow,
even on SparseDataFrames. To make things easier -- and because I
know from experience that this approach worked, and let Qurro handle
huge EMP-scale amounts of data -- I reorganized things so that we
first filter the feature ranks and remove empty samples, THEN do matching.

This works, but it breaks a lot of the unit tests (and some of
the integration tests that rely on specific error messages that
are now changed). Need to fix these then double check that everything
works properly.

This is a prerequisite for #58. I also have an algorithm for that sketched out.
  • Loading branch information
fedarko committed Jul 2, 2019
1 parent 12c5c4a commit a7f4b7d
Show file tree
Hide file tree
Showing 6 changed files with 104 additions and 90 deletions.
62 changes: 27 additions & 35 deletions qurro/_df_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,46 +145,39 @@ def biom_table_to_sparse_df(table, min_row_ct=2, min_col_ct=1):
return table_sdf


def remove_empty_samples(table_sdf, sample_metadata_df):
"""Removes samples with 0 counts for every feature from the table and
sample metadata DataFrame.
This should be called *after* matching the table with the sample
metadata -- we assume that the columns of the table DataFrame are
equivalent to the indices of the sample metadata DataFrame.
This will raise a ValueError if, after removing empty samples, either
the table's columns or the metadata's indices are empty (this will
happen in the case where all of the samples in these DataFrames are
empty).
def remove_empty_samples(biom_table):
"""Removes samples with 0 counts for every feature from a BIOM table.
This will raise a ValueError if, after removing empty samples, the
table's columns are empty (this will happen if all of the samples in
the table are empty).
"""
logging.debug("Attempting to remove empty samples.")
table_df_equal_to_zero = table_sdf == 0
nonempty_samples = []
for sample in table_sdf.columns:
if not table_df_equal_to_zero[sample].all():
nonempty_samples.append(sample)

filtered_table = table_sdf.filter(items=nonempty_samples, axis="columns")
filtered_metadata = sample_metadata_df.filter(
items=nonempty_samples, axis="index"
)
filtered_table = biom_table.remove_empty(axis="sample", inplace=False)

if len(filtered_table.columns) < 1 or len(filtered_metadata.index) < 1:
if filtered_table.shape[1] < 1:
raise ValueError("Found all empty samples with current features.")

sample_diff = len(table_sdf.columns) - len(filtered_table.columns)
sample_diff = filtered_table.shape[1] - biom_table.shape[1]
if sample_diff > 0:
logging.debug("Removed {} empty sample(s).".format(sample_diff))
else:
logging.debug("Couldn't find any empty samples.")
logging.debug("Couldn't find any empty samples to remove.")

return filtered_table, filtered_metadata
return filtered_table


def match_table_and_data(table, feature_ranks, sample_metadata):
"""Matches feature rankings and then sample metadata to a table.
This should bring us to a point where every specified feature/sample is
supported in the output table DataFrame.
Note that the input table here might contain features or samples that
are not included in feature_ranks or sample_metadata, respectively --
this is totally fine. However, errors may be raised if the opposite is
true; see the "Raises" section below for details.
Parameters
----------
Expand Down Expand Up @@ -225,12 +218,11 @@ def match_table_and_data(table, feature_ranks, sample_metadata):
in the table, this will raise a ValueError.
"""
logging.debug("Starting matching table with feature/sample data.")
# Match features to BIOM table, and then match samples to BIOM table.
# This should bring us to a point where every feature/sample is
# supported in the BIOM table. (Note that the input BIOM table might
# contain features or samples that are not included in feature_ranks or
# sample_metadata, respectively -- this is totally fine. The opposite,
# though, is a big no-no.)
# NOTE: if we actually did filtering in filter_unextreme_features, then
# this is an unnecessary step. TODO: make note of this and avoid this
# unnecessary operation in that case?
logging.debug("Starting matching table with feature rankings.")

featurefiltered_table, m_feature_ranks = matchdf(table, feature_ranks)
logging.debug("Matching table with feature ranks done.")
# Ensure that every ranked feature was present in the BIOM table. Raise an
Expand All @@ -244,8 +236,8 @@ def match_table_and_data(table, feature_ranks, sample_metadata):
if unsupported_feature_ct == 1:
word = "was"
raise ValueError(
"Of the {} ranked features, {} {} not present in "
"the input BIOM table.".format(
"Of {} ranked features, {} {} not present in "
"the BIOM table.".format(
feature_ranks.shape[0], unsupported_feature_ct, word
)
)
Expand All @@ -260,7 +252,7 @@ def match_table_and_data(table, feature_ranks, sample_metadata):
if m_sample_metadata.shape[0] < 1:
raise ValueError(
"None of the samples in the sample metadata file "
"are present in the input BIOM table."
"are present in the BIOM table."
)

dropped_sample_ct = sample_metadata.shape[0] - m_sample_metadata.shape[0]
Expand Down
37 changes: 29 additions & 8 deletions qurro/_rank_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
# ----------------------------------------------------------------------------

import logging
import biom
import skbio
import pandas as pd
from qurro._df_utils import escape_columns
Expand Down Expand Up @@ -89,7 +90,7 @@ def differentials_to_df(differentials_loc):


def filter_unextreme_features(
table: pd.SparseDataFrame,
table: biom.Table,
ranks: pd.DataFrame,
extreme_feature_count: int,
print_warning: bool = True,
Expand All @@ -99,10 +100,10 @@ def filter_unextreme_features(
Parameters
----------
table: pd.SparseDataFrame
A SparseDataFrame representation of a BIOM table. This can be
generated easily from a biom.Table object using
qurro._df_utils.biom_table_to_sparse_df().
table: biom.Table
A BIOM table for the dataset.
This checks to make sure that the remaining "extreme" features are
all in the table -- if not, then this throws a ValueError.
ranks: pandas.DataFrame
A DataFrame where the index consists of ranked features' IDs, and
Expand All @@ -123,8 +124,8 @@ def filter_unextreme_features(
Returns
-------
(table, ranks): (pandas.SparseDataFrame, pandas.DataFrame)
Filtered copies of the input table and ranks DataFrames.
(table, ranks): (biom.Table, pandas.DataFrame)
Filtered copies of the input BIOM table and feature ranking DF.
Behavior
--------
Expand Down Expand Up @@ -182,6 +183,7 @@ def filter_unextreme_features(
)
logging.debug("Input table has shape {}.".format(table.shape))
logging.debug("Input feature ranks have shape {}.".format(ranks.shape))

# We store these features in a set to avoid duplicates -- Python does the
# hard work here for us
features_to_preserve = set()
Expand All @@ -193,7 +195,26 @@ def filter_unextreme_features(

# Also filter ranks. Fortunately, DataFrame.filter() makes this easy.
filtered_ranks = ranks.filter(items=features_to_preserve, axis="index")
filtered_table = table.filter(items=features_to_preserve, axis="index")

# Filter the BIOM table to desired features.
def filter_biom_table(values, feature_id, _):
return feature_id in features_to_preserve

filtered_table = table.filter(
filter_biom_table, axis="observation", inplace=False
)

# Since Qurro filters unextreme features before matching the table with the
# feature ranks, it's possible that not all of the features that we filtered
# the table to are actually *present* in the table. So we need to quickly
# verify that the table contains all of the "extreme" features.
table_feature_ct = filtered_table.shape[0]
ranks_feature_ct = len(filtered_ranks.index)
if table_feature_ct < ranks_feature_ct:
raise ValueError(
'{} "extreme" ranked feature(s) were not present in '
"the input BIOM table.".format(ranks_feature_ct - table_feature_ct)
)

logging.debug("Output table has shape {}.".format(filtered_table.shape))
logging.debug(
Expand Down
24 changes: 11 additions & 13 deletions qurro/generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,36 +130,34 @@ def process_input(
if feature_metadata is not None:
feature_metadata = replace_nan(feature_metadata)

table = biom_table_to_sparse_df(biom_table)

# Match up the table with the feature ranks and sample metadata.
m_table, m_sample_metadata = match_table_and_data(
table, feature_ranks, sample_metadata
)

# Note that although we always call filter_unextreme_features(), filtering
# isn't necessarily always done (whether or not depends on the value of
# extreme_feature_count and the contents of the table/ranks).
filtered_table, filtered_ranks = filter_unextreme_features(
m_table, feature_ranks, extreme_feature_count
filtered_biom_table, filtered_ranks = filter_unextreme_features(
biom_table, feature_ranks, extreme_feature_count
)

# Filter now-empty samples from the BIOM table.
output_table, output_metadata = remove_empty_samples(
filtered_table, m_sample_metadata
filtered_biom_table = remove_empty_samples(filtered_biom_table)

unmatched_table = biom_table_to_sparse_df(filtered_biom_table)

# Match up the table with the feature ranks and sample metadata.
output_table, output_metadata = match_table_and_data(
unmatched_table, filtered_ranks, sample_metadata
)

# Save a list of ranking IDs (before we add in feature metadata)
ranking_ids = filtered_ranks.columns

filtered_ranks, feature_metadata_cols = merge_feature_metadata(
output_ranks, feature_metadata_cols = merge_feature_metadata(
filtered_ranks, feature_metadata
)

logging.debug("Finished input processing.")
return (
output_metadata,
filtered_ranks,
output_ranks,
ranking_ids,
feature_metadata_cols,
output_table,
Expand Down
Binary file added qurro/tests/input/moving_pictures/biplot.qzv
Binary file not shown.
Binary file added qurro/tests/input/moving_pictures/qurro-plot.qzv
Binary file not shown.
71 changes: 37 additions & 34 deletions qurro/tests/test_filter_unextreme_features.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import biom
from numpy import arange
import numpy as np
from numpy.testing import assert_array_equal
from pandas import DataFrame
from pandas.testing import assert_frame_equal
import pytest
from qurro._rank_utils import filter_unextreme_features
from qurro.generate import biom_table_to_sparse_df


def get_test_data():
Expand All @@ -22,58 +22,61 @@ def get_test_data():
# Based on the BIOM docs' example of initialization using a np ndarray --
# http://biom-format.org/documentation/table_objects.html#examples
#
# arange(40) generates a numpy ndarray that just goes from 0 to 39 (i.e.
# np.arange(40) generates a numpy ndarray that just goes from 0 to 39 (i.e.
# contains 40 numbers). We reshape this ndarray to give it a sort of
# "tabular" structure (a 2-D array containing 8 arrays, each with 5
# numbers).
underlying_table_data = arange(40).reshape(8, 5)
underlying_table_data = np.arange(40).reshape(8, 5)
# Set the third sample in the data to contain all zeros, except for a
# count for F4 (so we can test what this function does with so-called
# "empty" samples after filtering out F4).
underlying_table_data[:, 2] = 0.0
underlying_table_data[3, 2] = 1.0
# Finally, use the data to create a BIOM table object.
biom_table = biom.Table(underlying_table_data, feature_ids, sample_ids)
# ...And yeah we're actually making it into a Sparse DF because that's what
# I changed filter_unextreme_features() to expect now.
# (TODO: simplify this code in the future?)
output_table = biom_table_to_sparse_df(biom_table)

return output_table, ranks
return biom_table, ranks


def test_filtering_basic():
"""Tests the standard behavior of filter_unextreme_features()."""

table, ranks = get_test_data()
biom_table, ranks = get_test_data()
filtered_table, filtered_ranks = filter_unextreme_features(
table, ranks, 2, print_warning=False
biom_table, ranks, 2, print_warning=False
)
# Check that the appropriate features/samples were filtered out of the
# table. NOTE -- I know this is sloppy code. Would like to fix it in the
# future.
for fid in ["F1", "F2", "F7", "F8"]:
assert fid in filtered_table.index
assert filtered_table.exists(fid, axis="observation")
for fid in ["F3", "F4", "F5", "F6"]:
assert fid not in filtered_table.index
assert not filtered_table.exists(fid, axis="observation")
# Check that all samples were preserved.
# (The removal of empty samples is done *after*
# filter_unextreme_features() is called in normal Qurro execution, so we
# should expect all samples -- even empty ones -- to remain here.)
for sid in ["S1", "S2", "S3", "S4", "S5"]:
assert sid in filtered_table.columns

assert filtered_table.exists(sid, axis="sample")
# Check that the appropriate data is left in the table.
assert list(filtered_table.loc["F1"]) == [0, 1, 0, 3, 4]
assert list(filtered_table.loc["F2"]) == [5, 6, 0, 8, 9]
assert list(filtered_table.loc["F7"]) == [30, 31, 0, 33, 34]
assert list(filtered_table.loc["F8"]) == [35, 36, 0, 38, 39]
assert_array_equal(
filtered_table.data("F1", axis="observation"), [0, 1, 0, 3, 4]
)
assert_array_equal(
filtered_table.data("F2", axis="observation"), [5, 6, 0, 8, 9]
)
assert_array_equal(
filtered_table.data("F7", axis="observation"), [30, 31, 0, 33, 34]
)
assert_array_equal(
filtered_table.data("F8", axis="observation"), [35, 36, 0, 38, 39]
)

# Check that the rank filtering worked as expected.
expected_filtered_ranks = DataFrame(
{"Rank 0": [1, 2, 7, 8], "Rank 1": [8, 7, 2, 1]},
index=["F1", "F2", "F7", "F8"],
)
# Check that the rank filtering worked as expected.
assert_frame_equal(
filtered_ranks, expected_filtered_ranks, check_like=True
)
Expand All @@ -84,19 +87,19 @@ def test_filtering_large_efc():
is greater than or equal to the number of ranked features.
"""

table, ranks = get_test_data()
biom_table, ranks = get_test_data()

# The number of ranked features is 8.
filtered_table, filtered_ranks = filter_unextreme_features(
table, ranks, 4, print_warning=False
biom_table, ranks, 4, print_warning=False
)
assert_frame_equal(table, filtered_table)
assert biom_table == filtered_table
assert_frame_equal(ranks, filtered_ranks)

filtered_table, filtered_ranks = filter_unextreme_features(
table, ranks, 8, print_warning=False
biom_table, ranks, 8, print_warning=False
)
assert_frame_equal(table, filtered_table)
assert biom_table == filtered_table
assert_frame_equal(ranks, filtered_ranks)


Expand All @@ -106,12 +109,12 @@ def test_filtering_no_efc():
done).
"""

table, ranks = get_test_data()
biom_table, ranks = get_test_data()

filtered_table, filtered_ranks = filter_unextreme_features(
table, ranks, None, print_warning=False
biom_table, ranks, None, print_warning=False
)
assert_frame_equal(table, filtered_table)
assert biom_table == filtered_table
assert_frame_equal(ranks, filtered_ranks)


Expand All @@ -120,19 +123,19 @@ def test_filtering_invalid_efc():
extreme feature count is less than 1 and/or not an integer.
"""

table, ranks = get_test_data()
biom_table, ranks = get_test_data()

with pytest.raises(ValueError):
filter_unextreme_features(table, ranks, 0)
filter_unextreme_features(biom_table, ranks, 0)

with pytest.raises(ValueError):
filter_unextreme_features(table, ranks, -1)
filter_unextreme_features(biom_table, ranks, -1)

with pytest.raises(ValueError):
filter_unextreme_features(table, ranks, -2)
filter_unextreme_features(biom_table, ranks, -2)

with pytest.raises(ValueError):
filter_unextreme_features(table, ranks, 1.5)
filter_unextreme_features(biom_table, ranks, 1.5)

with pytest.raises(ValueError):
filter_unextreme_features(table, ranks, 5.5)
filter_unextreme_features(biom_table, ranks, 5.5)

0 comments on commit a7f4b7d

Please sign in to comment.