Skip to content

Commit

Permalink
ENH: Add sparsify_count_dict() and unit tests #58
Browse files Browse the repository at this point in the history
Need to actually integrate this with Qurro, but close!
  • Loading branch information
fedarko committed Jul 7, 2019
1 parent d68deed commit cea1037
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 0 deletions.
26 changes: 26 additions & 0 deletions qurro/_df_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,3 +372,29 @@ def merge_feature_metadata(feature_ranks, feature_metadata=None):
output_feature_data = feature_ranks

return output_feature_data, feature_metadata_cols


def sparsify_count_dict(count_dict):
    """Return a "sparse" copy of a nested dict of counts.

    The input is expected to be of the format ``{feature ID: {sample ID:
    count, sample 2 ID: count, ...}, ...}``. A transposed dict (where samples
    are the "outer layer") is handled in exactly the same way; only the
    variable names used here would read backwards. (See #175 on GitHub for
    context.)

    Parameters
    ----------
    count_dict: dict
        Two-level mapping of outer ID -> {inner ID -> count}.

    Returns
    -------
    dict
        A new dict with the same outer keys as ``count_dict``, where each
        inner dict only retains the entries whose count is not equal to 0.
        The input dict is not modified.

        Since empty features are expected to be filtered out before this is
        called, every feature should normally keep at least one sample. If a
        feature does only have 0 counts, it is still included in the output,
        mapped to an empty dict.
    """
    return {
        feature_id: {
            sample_id: count
            for sample_id, count in sample_counts.items()
            # Dropping the 0-count entries is what makes the output "sparse".
            if count != 0
        }
        for feature_id, sample_counts in count_dict.items()
    }
30 changes: 30 additions & 0 deletions qurro/tests/test_df_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
replace_nan,
remove_empty_samples_and_features,
merge_feature_metadata,
sparsify_count_dict,
)


Expand Down Expand Up @@ -397,3 +398,32 @@ def test_merge_feature_metadata():
fm.columns = ["FM1", "R2", "FM3"]
with pytest.raises(ValueError):
merge_feature_metadata(ranks, fm)


def test_sparsify_count_dict():
    """Check sparsify_count_dict() on mixed, dense, all-zero and empty input."""

    # Test that it works in basic case
    test_cts = {
        "Feature 1": {"Sample 1": 0, "Sample 2": 3, "Sample 3": 0},
        "Feature 2": {"Sample 1": 2, "Sample 2": 4, "Sample 3": 0},
    }
    assert sparsify_count_dict(test_cts) == {
        "Feature 1": {"Sample 2": 3},
        "Feature 2": {"Sample 1": 2, "Sample 2": 4},
    }
    # The function should build a new dict rather than strip zeros out of the
    # one it was given, so the input should be unchanged after the call
    assert test_cts == {
        "Feature 1": {"Sample 1": 0, "Sample 2": 3, "Sample 3": 0},
        "Feature 2": {"Sample 1": 2, "Sample 2": 4, "Sample 3": 0},
    }

    # Test that it works even when the data is inherently dense (i.e. no zeros)
    test_cts = {
        "Feature 1": {"Sample 1": 1, "Sample 2": 3, "Sample 3": 5},
        "Feature 2": {"Sample 1": 2, "Sample 2": 4, "Sample 3": 6},
    }
    assert sparsify_count_dict(test_cts) == test_cts

    # Test that it works even when the data is totally sparse
    # This should never happen, since we filter out empty features, but good to
    # be sure
    test_cts = {
        "Feature 1": {"Sample 1": 0, "Sample 2": 0, "Sample 3": 0},
        "Feature 2": {"Sample 1": 0, "Sample 2": 0, "Sample 3": 0},
    }
    assert sparsify_count_dict(test_cts) == {"Feature 1": {}, "Feature 2": {}}

    # Test that an input with no features at all just gives an empty dict
    assert sparsify_count_dict({}) == {}

0 comments on commit cea1037

Please sign in to comment.