From cea10372507017a7ea683a4db48b0a2955454fbb Mon Sep 17 00:00:00 2001
From: Marcus Fedarko <mfedarko@ucsd.edu>
Date: Sat, 6 Jul 2019 17:35:33 -0700
Subject: [PATCH] ENH: Add sparsify_count_dict() and unit tests #58

Need to actually integrate this with Qurro, but close!
---
 qurro/_df_utils.py           | 26 ++++++++++++++++++++++++++
 qurro/tests/test_df_utils.py | 30 ++++++++++++++++++++++++++++++
 2 files changed, 56 insertions(+)

diff --git a/qurro/_df_utils.py b/qurro/_df_utils.py
index 2c374c77..b5ccd7fd 100644
--- a/qurro/_df_utils.py
+++ b/qurro/_df_utils.py
@@ -372,3 +372,29 @@ def merge_feature_metadata(feature_ranks, feature_metadata=None):
         output_feature_data = feature_ranks
 
     return output_feature_data, feature_metadata_cols
+
+
+def sparsify_count_dict(count_dict):
+    """Returns a "sparse" representation of a dict of counts data.
+
+       We expect that the input dict is of the format
+       {feature ID: {sample 1 ID: count, sample 2 ID: count, ...}, ...}.
+       A transposed dict (where samples are the "outer layer") is handled the
+       same way, but the variable names would need to be flipped to make this
+       function make sense. (See #175 on GitHub for context.)
+
+       Returns a new dict (the input is not modified) that omits every
+       0-count sample for a given feature. Every feature ID is kept, so an
+       all-zero feature maps to {}. (Since we filter out empty features, we
+       expect every feature to have at least one nonzero count.)
+    """
+    sparse_count_dict = {}
+    for feature_id, sample_counts in count_dict.items():
+        # This will be the new sample_counts dict for this feature ID, but only
+        # containing sample IDs with nonzero counts (it may end up empty).
+        fdict = {}
+        for sample_id, count in sample_counts.items():
+            if count != 0:
+                fdict[sample_id] = count
+        sparse_count_dict[feature_id] = fdict
+    return sparse_count_dict
diff --git a/qurro/tests/test_df_utils.py b/qurro/tests/test_df_utils.py
index 7b5edc04..21df5871 100644
--- a/qurro/tests/test_df_utils.py
+++ b/qurro/tests/test_df_utils.py
@@ -8,6 +8,7 @@
     replace_nan,
     remove_empty_samples_and_features,
     merge_feature_metadata,
+    sparsify_count_dict,
 )
 
 
@@ -397,3 +398,32 @@ def test_merge_feature_metadata():
     fm.columns = ["FM1", "R2", "FM3"]
     with pytest.raises(ValueError):
         merge_feature_metadata(ranks, fm)
+
+
+def test_sparsify_count_dict():
+    """Tests sparsify_count_dict() on mixed, dense and all-zero counts."""
+    # Basic case: 0-count samples are dropped and nonzero counts are kept
+    test_cts = {
+        "Feature 1": {"Sample 1": 0, "Sample 2": 3, "Sample 3": 0},
+        "Feature 2": {"Sample 1": 2, "Sample 2": 4, "Sample 3": 0},
+    }
+    assert sparsify_count_dict(test_cts) == {
+        "Feature 1": {"Sample 2": 3},
+        "Feature 2": {"Sample 1": 2, "Sample 2": 4},
+    }
+
+    # Inherently dense data (i.e. no zeros): output should equal the input
+    test_cts = {
+        "Feature 1": {"Sample 1": 1, "Sample 2": 3, "Sample 3": 5},
+        "Feature 2": {"Sample 1": 2, "Sample 2": 4, "Sample 3": 6},
+    }
+    assert sparsify_count_dict(test_cts) == test_cts
+
+    # Test that it works even when the data is totally sparse: each feature
+    # should still be present, but mapped to an empty dict. This should never
+    # happen, since we filter out empty features, but good to be sure
+    test_cts = {
+        "Feature 1": {"Sample 1": 0, "Sample 2": 0, "Sample 3": 0},
+        "Feature 2": {"Sample 1": 0, "Sample 2": 0, "Sample 3": 0},
+    }
+    assert sparsify_count_dict(test_cts) == {"Feature 1": {}, "Feature 2": {}}