From b8c90babb0e0527507fbfb8674f941be87241cf4 Mon Sep 17 00:00:00 2001 From: Marcus Fedarko Date: Wed, 3 Jul 2019 16:38:41 -0700 Subject: [PATCH] ENH: Use DF.align instead of matchdf: close #172! Turns out this is a ton more efficient. Added bonus of now relying on pandas' implementation of this instead of ours. Turns out transposing huge dataframes is a pretty significant endeavor, so calling .T on the feature table for like the EMP dataset was taking a super long time. Fortunately, we can finesse our way around this by instead transposing the sample metadata and then aligning on the columns. I'm glad that we reached a solution for this that preserved all of the matching-up-front niceness re: testing. Solid stuff. Uh, next up are #171 and then #58? But we can def merge this back into master now. --- qurro/_df_utils.py | 53 ++++++++++++++------------- qurro/tests/test_df_utils.py | 70 ------------------------------------ 2 files changed, 28 insertions(+), 95 deletions(-) diff --git a/qurro/_df_utils.py b/qurro/_df_utils.py index 6539b845..0b08af6a 100644 --- a/qurro/_df_utils.py +++ b/qurro/_df_utils.py @@ -11,16 +11,6 @@ import pandas as pd -def matchdf(df1, df2): - """Filters both DataFrames to just the rows of their shared indices. - - Derived from gneiss.util.match() (https://github.com/biocore/gneiss). - """ - - idx = set(df1.index) & set(df2.index) - return df1.loc[idx], df2.loc[idx] - - def ensure_df_headers_unique(df, df_name): """Raises an error if the index or columns of the DataFrame aren't unique. @@ -193,6 +183,14 @@ def match_table_and_data(table, feature_ranks, sample_metadata): should correspond to observations (i.e. features), and the columns should correspond to samples. + Note that the input BIOM table might contain features or samples + that are not included in feature_ranks or sample_metadata, + respectively -- this is totally fine. 
The opposite, though, is + where things get to be a problem: if any of the features in + feature_ranks are not present in the table, or if none of the + samples in sample_metadata are in the table, then this will + raise errors. + feature_ranks: pd.DataFrame A DataFrame describing features' "ranks" along ranking(s). The index of this DataFrame should correspond to feature IDs, and the @@ -224,14 +222,10 @@ def match_table_and_data(table, feature_ranks, sample_metadata): If all of the samples described in sample_metadata are not present in the table, this will raise a ValueError. """ - logging.debug("Starting matching table with feature/sample data.") - # Match features to BIOM table, and then match samples to BIOM table. - # This should bring us to a point where every feature/sample is - # supported in the BIOM table. (Note that the input BIOM table might - # contain features or samples that are not included in feature_ranks or - # sample_metadata, respectively -- this is totally fine. The opposite, - # though, is a big no-no.) - featurefiltered_table, m_feature_ranks = matchdf(table, feature_ranks) + logging.debug("Starting matching table with feature rankings.") + featurefiltered_table, m_feature_ranks = table.align( + feature_ranks, axis="index", join="inner" + ) logging.debug("Matching table with feature ranks done.") # Ensure that every ranked feature was present in the BIOM table. Raise an # error if this isn't the case. @@ -250,11 +244,23 @@ def match_table_and_data(table, feature_ranks, sample_metadata): ) ) - logging.debug("Starting matching table with sample metadata.") - m_table_transpose, m_sample_metadata = matchdf( - featurefiltered_table.T, sample_metadata + # We transpose the sample metadata instead of the actual table because + # transposing in pandas, at least from some personal testing, can be really + # expensive for huge (EMP-scale) DataFrames. Since sample metadata will + # generally be smaller than the actual table, we transpose that. 
+ logging.debug( + "Temporarily transposing sample metadata to make matching easier." + ) + sample_metadata_transposed = sample_metadata.T + logging.debug("Transposing done.") + logging.debug("Starting matching table with (tranposed) sample metadata.") + m_table, m_sample_metadata_transposed = featurefiltered_table.align( + sample_metadata_transposed, axis="columns", join="inner" ) logging.debug("Matching table with sample metadata done.") + logging.debug("Transposing sample metadata again to reset it.") + m_sample_metadata = m_sample_metadata_transposed.T + logging.debug("Transposing done.") # Allow for dropped samples (e.g. negative controls), but ensure that at # least one sample is supported by the BIOM table. if m_sample_metadata.shape[0] < 1: @@ -270,10 +276,7 @@ def match_table_and_data(table, feature_ranks, sample_metadata): "present in the BIOM table, and have been removed from the " "visualization.".format(dropped_sample_ct) ) - # We return the transpose of the transposed table, so the table should have - # the same "orientation" (i.e. columns are samples, rows (indices) are - # features) as the input table. 
- return m_table_transpose.T, m_sample_metadata + return m_table, m_sample_metadata def merge_feature_metadata(feature_ranks, feature_metadata=None): diff --git a/qurro/tests/test_df_utils.py b/qurro/tests/test_df_utils.py index 0779e348..bc1a09df 100644 --- a/qurro/tests/test_df_utils.py +++ b/qurro/tests/test_df_utils.py @@ -3,7 +3,6 @@ from pandas.testing import assert_frame_equal, assert_series_equal import numpy as np from qurro._df_utils import ( - matchdf, ensure_df_headers_unique, validate_df, replace_nan, @@ -11,75 +10,6 @@ ) -def test_matchdf(): - """Tests the matchdf() function.""" - - df1 = DataFrame( - { - "col1": [1, 2, 3, 4, 5], - "col2": [6, 7, 8, 9, 10], - "col3": [11, 12, 13, 14, 15], - }, - index=["a", "b", "c", "d", "e"], - ) - df2 = DataFrame( - { - "colA": [5, 4, 3, 2, 1], - "colB": [10, 9, 8, 7, 6], - "colC": [15, 14, 13, 12, 11], - "colD": ["q", "w", "e", "r", "t"], - }, - index=["a", "c", "d", "x", "y"], - ) - df3 = DataFrame(index=["a", "x"]) - df4 = DataFrame(index=["x"]) - - # The ground truth DF from matching dfX with dfY is named dfXY - df12 = DataFrame( - {"col1": [1, 3, 4], "col2": [6, 8, 9], "col3": [11, 13, 14]}, - index=["a", "c", "d"], - ) - df21 = DataFrame( - { - "colA": [5, 4, 3], - "colB": [10, 9, 8], - "colC": [15, 14, 13], - "colD": ["q", "w", "e"], - }, - index=["a", "c", "d"], - ) - df13 = DataFrame({"col1": [1], "col2": [6], "col3": [11]}, index=["a"]) - df31 = DataFrame(index=["a"]) - # we need to specify a dtype of "int64" here because pandas, by default, - # infers that df14's dtype is just "object"; however, the result of - # matching df1 and df4 will have an "int64" dtype (since df1 already has - # an inferred "int64" dtype). 
- df14 = DataFrame(columns=["col1", "col2", "col3"]).astype("int64") - df41 = DataFrame() - - # Basic testing: ensure that matching results match up with the ground - # truths - A, B = matchdf(df1, df2) - assert_frame_equal(A, df12, check_like=True) - assert_frame_equal(B, df21, check_like=True) - - # Test "commutativity" of matchdf() -- reversing the DFs' orders shouldn't - # change the matching results (aside from the output order, of course) - A, B = matchdf(df2, df1) - assert_frame_equal(A, df21, check_like=True) - assert_frame_equal(B, df12, check_like=True) - - # Test that matching with empty DFs works as expected - # First, try matching in the case where at least one index name matches - A, B = matchdf(df1, df3) - assert_frame_equal(A, df13, check_like=True) - assert_frame_equal(B, df31, check_like=True) - # Next, try matching in the case where there's no overlap in index names - A, B = matchdf(df1, df4) - assert_frame_equal(A, df14, check_like=True) - assert_frame_equal(B, df41, check_like=True) - - def test_ensure_df_headers_unique(): """Tests the ensure_df_headers_unique() function in generate.py."""