Skip to content

Commit

Permalink
MAINT: Remove empty features #171
Browse files Browse the repository at this point in the history
This is going to be kind of inconvenient to apply to the feature
ranks as well. I think I'm going to go back and try an alternative approach
to all of this stuff (#172, #171, #58) on another branch.
  • Loading branch information
fedarko committed Jul 3, 2019
1 parent 918a340 commit 5e9977a
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 23 deletions.
28 changes: 19 additions & 9 deletions qurro/_df_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,24 +145,34 @@ def biom_table_to_sparse_df(table, min_row_ct=2, min_col_ct=1):
return table_sdf


def remove_empty_samples(biom_table):
"""Removes samples with 0 counts for every feature from a BIOM table.
def remove_empty(biom_table):
"""Removes "empty" samples and features from a BIOM table.
This will raise a ValueError if, after removing empty samples, the
table's columns are empty (this will happen if all of the samples in
the table are empty).
"""
logging.debug("Attempting to remove empty samples.")
filtered_table = biom_table.remove_empty(axis="sample", inplace=False)
# TODO make this also somehow alter the ranks so that these are removed
# from there? (so matching doesn't fail?)
# And while we're at it we should maybe also do the same thing for sample
# metadata tbh.
logging.debug("Attempting to remove empty samples and features.")
filtered_table = biom_table.remove_empty(inplace=False)

if filtered_table.shape[0] < 1:
raise ValueError("All of the current features in the table are empty.")

if filtered_table.shape[1] < 1:
raise ValueError("Found all empty samples with current features.")

sample_diff = filtered_table.shape[1] - biom_table.shape[1]
if sample_diff > 0:
logging.debug("Removed {} empty sample(s).".format(sample_diff))
else:
logging.debug("Couldn't find any empty samples to remove.")
for name, idx in (("feature", 0), ("sample", 1)):
diff = filtered_table.shape[idx] - biom_table.shape[idx]
if diff > 0:
logging.debug("Removed {} empty {}(s).".format(name, diff))
else:
logging.debug(
"Couldn't find any empty {}s to remove.".format(name)
)

return filtered_table

Expand Down
8 changes: 4 additions & 4 deletions qurro/generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
replace_nan,
validate_df,
biom_table_to_sparse_df,
remove_empty_samples,
remove_empty,
match_table_and_data,
merge_feature_metadata,
)
Expand Down Expand Up @@ -75,8 +75,8 @@ def process_input(
using the provided extreme_feature_count. (If it's None, then nothing
will be done.)
4. Calls remove_empty_samples() to filter samples without any counts for
any features from the BIOM table. This is purposefully done *after*
4. Calls remove_empty() to filter empty samples and features from the
BIOM table. This is purposefully done *after*
filter_unextreme_features() is called.
5. Converts the BIOM table to a SparseDataFrame by calling
Expand Down Expand Up @@ -145,7 +145,7 @@ def process_input(
)

# 4. Filter (now-)empty samples from the BIOM table.
filtered_biom_table = remove_empty_samples(feature_filtered_biom_table)
filtered_biom_table = remove_empty(feature_filtered_biom_table)

# 5. Convert the BIOM table to a SparseDataFrame.
unmatched_table = biom_table_to_sparse_df(filtered_biom_table)
Expand Down
20 changes: 10 additions & 10 deletions qurro/tests/test_df_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
ensure_df_headers_unique,
validate_df,
replace_nan,
remove_empty_samples,
remove_empty,
)


Expand Down Expand Up @@ -241,7 +241,7 @@ def test_replace_nan():


def get_test_data():
"""Returns test data for the remove_empty_samples() tests."""
"""Returns test data for the remove_empty() tests."""
feature_ids = ["F1", "F2", "F3", "F4", "F5", "F6", "F7", "F8"]
sample_ids = ["Sample1", "Sample2", "Sample3", "Sample4"]
table_data = np.array(
Expand All @@ -255,24 +255,24 @@ def get_test_data():
return table_data, feature_ids, sample_ids


def test_remove_empty_samples_basic():
"""Tests remove_empty_samples() in the simple cases of removing 0, 1, and 2
def test_remove_empty_basic():
"""Tests remove_empty() in the simple cases of removing 0, 1, and 2
empty sample(s).
"""

# TRY REMOVING 0 SAMPLES
data, fids, sids = get_test_data()
table = biom.Table(data, fids, sids)
# Check that, when none of the samples are empty, nothing is changed.
ftable = remove_empty_samples(table).to_dataframe()
ftable = remove_empty(table).to_dataframe()
assert_frame_equal(ftable, table.to_dataframe())

# TRY REMOVING 1 SAMPLE
# Zero out Sample3 (it only has one count, for F1)
data[0][2] = 0
table = biom.Table(data, fids, sids)
# Check that just the one empty sample (Sample3) was removed.
ftable = remove_empty_samples(table).to_dataframe()
ftable = remove_empty(table).to_dataframe()
assert_array_equal(ftable["Sample1"], data[:, 0])
assert_array_equal(ftable["Sample2"], data[:, 1])
assert_array_equal(ftable["Sample4"], data[:, 3])
Expand All @@ -284,7 +284,7 @@ def test_remove_empty_samples_basic():
# Now, zero out Sample4 (it only has one count in F4)
data[3][3] = 0
table = biom.Table(data, fids, sids)
ftable = remove_empty_samples(table).to_dataframe()
ftable = remove_empty(table).to_dataframe()
assert_array_equal(ftable["Sample1"], data[:, 0])
assert_array_equal(ftable["Sample2"], data[:, 1])
assert "Sample3" not in ftable.columns
Expand All @@ -293,10 +293,10 @@ def test_remove_empty_samples_basic():
assert len(ftable.index) == len(fids) == 8


def test_remove_empty_samples_allempty():
"""Tests remove_empty_samples() when all samples in the table are empty."""
def test_remove_empty_allempty():
"""Tests remove_empty() when all samples in the table are empty."""

_, feature_ids, sample_ids = get_test_data()
table = biom.Table(np.zeros(32).reshape(8, 4), feature_ids, sample_ids)
with pytest.raises(ValueError):
remove_empty_samples(table)
remove_empty(table)

0 comments on commit 5e9977a

Please sign in to comment.