# Reconstructed from a whitespace-collapsed git patch:
#   Commit:  cea10372507017a7ea683a4db48b0a2955454fbb
#   Author:  Marcus Fedarko
#   Date:    Sat, 6 Jul 2019 17:35:33 -0700
#   Subject: ENH: Add sparsify_count_dict() and unit tests #58
#   Commit note: "Need to actually integrate this with Qurro, but close!"
#
# The patch adds sparsify_count_dict() to qurro/_df_utils.py (after
# merge_feature_metadata()) and test_sparsify_count_dict() to
# qurro/tests/test_df_utils.py (which also adds sparsify_count_dict to that
# test module's import list).


def sparsify_count_dict(count_dict):
    """Return a "sparse" copy of a dict of counts data.

    The input is expected to be of the format {feature ID: {sample ID:
    count, sample 2 ID: count, ...}, ...}. A transposed dict (where samples
    are the "outer layer" of the dict) would also work, but the variable
    names used here would then be flipped. (See #175 on GitHub for context.)

    Parameters
    ----------
    count_dict: dict
        Maps each feature ID to a dict that maps sample IDs to counts.

    Returns
    -------
    dict
        A new dict with the same feature IDs as count_dict, where each inner
        dict only contains the sample IDs whose count is not equal to 0.
        The input dict and its inner dicts are not modified. A feature whose
        counts are all 0 is kept, mapped to an empty dict. (Since empty
        features are expected to have been filtered out beforehand, every
        feature should normally have at least one nonzero count.)
    """
    sparse_count_dict = {}
    for feature_id, sample_counts in count_dict.items():
        # Build a fresh inner dict so that the caller's data is untouched;
        # only samples with nonzero counts survive.
        sparse_count_dict[feature_id] = {
            sample_id: count
            for sample_id, count in sample_counts.items()
            if count != 0
        }
    return sparse_count_dict


# NOTE: in the original patch this test lives in
# qurro/tests/test_df_utils.py.
def test_sparsify_count_dict():
    """Check sparsify_count_dict() on mixed, dense and all-zero inputs."""

    # Test that it works in basic case
    test_cts = {
        "Feature 1": {"Sample 1": 0, "Sample 2": 3, "Sample 3": 0},
        "Feature 2": {"Sample 1": 2, "Sample 2": 4, "Sample 3": 0},
    }
    assert sparsify_count_dict(test_cts) == {
        "Feature 1": {"Sample 2": 3},
        "Feature 2": {"Sample 1": 2, "Sample 2": 4},
    }
    # The input should not have been modified in the process
    assert test_cts == {
        "Feature 1": {"Sample 1": 0, "Sample 2": 3, "Sample 3": 0},
        "Feature 2": {"Sample 1": 2, "Sample 2": 4, "Sample 3": 0},
    }

    # Test that it works even when the data is inherently dense (i.e. no zeros)
    test_cts = {
        "Feature 1": {"Sample 1": 1, "Sample 2": 3, "Sample 3": 5},
        "Feature 2": {"Sample 1": 2, "Sample 2": 4, "Sample 3": 6},
    }
    assert sparsify_count_dict(test_cts) == test_cts

    # Test that it works even when the data is totally sparse
    # This should never happen, since we filter out empty features, but good to
    # be sure
    test_cts = {
        "Feature 1": {"Sample 1": 0, "Sample 2": 0, "Sample 3": 0},
        "Feature 2": {"Sample 1": 0, "Sample 2": 0, "Sample 3": 0},
    }
    assert sparsify_count_dict(test_cts) == {"Feature 1": {}, "Feature 2": {}}