From fe40ba53d30439ff4bc3194aea3695dc95723524 Mon Sep 17 00:00:00 2001 From: Christian Diener Date: Thu, 7 Mar 2019 11:51:59 -0800 Subject: [PATCH 1/4] specify fill value --- biom/table.py | 9 +++++---- biom/tests/test_table.py | 9 ++++++++- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/biom/table.py b/biom/table.py index 30a86035..f243cd6f 100644 --- a/biom/table.py +++ b/biom/table.py @@ -178,7 +178,7 @@ from copy import deepcopy from datetime import datetime from json import dumps -from functools import reduce +from functools import reduce, partial from operator import itemgetter from future.builtins import zip from future.utils import viewitems @@ -4045,9 +4045,10 @@ def to_dataframe(self, dense=False): mat = self.matrix_data.toarray() constructor = pd.DataFrame else: - mat = [pd.SparseSeries(r.toarray().squeeze()) - for r in self.matrix_data.tocsr()] - constructor = pd.SparseDataFrame + mat = self.matrix_data + constructor = partial(pd.SparseDataFrame, + default_fill_value=0, + copy=True) return constructor(mat, index=index, columns=columns) diff --git a/biom/tests/test_table.py b/biom/tests/test_table.py index a1245892..0503551b 100644 --- a/biom/tests/test_table.py +++ b/biom/tests/test_table.py @@ -1475,10 +1475,17 @@ def test_add_group_metadata_w_existing_metadata(self): def test_to_dataframe(self): exp = pd.SparseDataFrame(np.array([[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]), index=['O1', 'O2'], - columns=['S1', 'S2', 'S3']) + columns=['S1', 'S2', 'S3'], + default_fill_value = 0.0) obs = example_table.to_dataframe() pdt.assert_frame_equal(obs, exp) + def test_to_dataframe_is_sparse(self): + df = example_table.to_dataframe() + density = (example_table.matrix_data.getnnz() / + np.prod(example_table.shape)) + assert np.allclose(df.density, density) + def test_to_dataframe_dense(self): exp = pd.DataFrame(np.array([[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]), index=['O1', 'O2'], From bbf66b34c452810bac64a2e2d8f9a589a143aa8e Mon Sep 17 00:00:00 2001 From: 
Christian Diener Date: Thu, 7 Mar 2019 13:10:33 -0800 Subject: [PATCH 2/4] fix flake8 --- biom/tests/test_table.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/biom/tests/test_table.py b/biom/tests/test_table.py index 0503551b..5a881bdc 100644 --- a/biom/tests/test_table.py +++ b/biom/tests/test_table.py @@ -1476,7 +1476,7 @@ def test_to_dataframe(self): exp = pd.SparseDataFrame(np.array([[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]), index=['O1', 'O2'], columns=['S1', 'S2', 'S3'], - default_fill_value = 0.0) + default_fill_value=0.0) obs = example_table.to_dataframe() pdt.assert_frame_equal(obs, exp) From 6c11d54e87473700b952123e5d4975aea2ae2407 Mon Sep 17 00:00:00 2001 From: Christian Diener Date: Thu, 7 Mar 2019 13:16:57 -0800 Subject: [PATCH 3/4] add to changelog --- ChangeLog.md | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md index cb27d306..4bfeb922 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -10,21 +10,23 @@ New Features: Bug fixes: +* `Table.to_dataframe(dense=False)` now correctly produces sparse data frames (instead of accidentally dense ones, as before). + biom 2.1.7 ---------- New features and bug fixes, released on 28 September 2018. -Important: +Important: * Python 3.4 support has been dropped. We now only support Python 2.7, 3.5, 3.6 and 3.7. * We will be dropping Python 2.7 support on the next release. -* Pandas >= 0.20.0 is now the minimum required version. +* Pandas >= 0.20.0 is now the minimum required version. * pytest is now used instead of nose. New Features: -* Massive performance boost to `Table.collapse` with the default collapse function. The difference was 10s of milliseconds vs. 
minutes stemming from prior use of `operator.add`. See [issue #761](https://github.com/biocore/biom-format/issues/761). * `Table.align_to` for aligning one table to another. This is useful in multi-omic analyses where multiple preparations have been performed on the sample physical samples. This is essentially a helper method around `Table.sort_order`. See [issue #747](https://github.com/biocore/biom-format/issues/747). * Added additional sanity checks when calling `Table.to_hdf5`, see [PR #769](https://github.com/biocore/biom-format/pull/769). * `Table.subsample()` can optionally perform subsampling with replacement. See [issue #774](https://github.com/biocore/biom-format/issues/774). @@ -47,7 +49,7 @@ New Features: * `Table.from_hdf5` now supports a rapid subset in the event that metadata is not needed. In benchmarking against the Earth Microbiome Project BIOM table, the reduction in runtime was multiple orders of magnitude while additionally - preserving substantial memory. + preserving substantial memory. * `Table.rankdata` has been added to convert values to ranked abundances on either axis. See [issue #645](https://github.com/biocore/biom-format/issues/639). * Format of numbers in ``biom summarize-table`` output is now more readable and localized. See [issue #679](https://github.com/biocore/biom-format/issues/679). @@ -105,8 +107,8 @@ Bug fixes: * `biom --version` now prints the software version (previously the individual commands did this, but not the base command). * `Table.vlen_list_of_str_formatter` was considering a `str` to be valid for - formatting resulting in an obscure error when a `str`, as opposed to a - `list` of `str`, was used for taxonomy. See + formatting resulting in an obscure error when a `str`, as opposed to a + `list` of `str`, was used for taxonomy. See [issue #709](https://github.com/biocore/biom-format/issues/709). 
biom 2.1.4 From f9676d8ea4b98873ec242db661d28a06a028af03 Mon Sep 17 00:00:00 2001 From: Christian Diener Date: Thu, 7 Mar 2019 13:19:37 -0800 Subject: [PATCH 4/4] fix test for Python 2 --- biom/tests/test_table.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/biom/tests/test_table.py b/biom/tests/test_table.py index 5a881bdc..2a88f62b 100644 --- a/biom/tests/test_table.py +++ b/biom/tests/test_table.py @@ -1482,7 +1482,7 @@ def test_to_dataframe(self): def test_to_dataframe_is_sparse(self): df = example_table.to_dataframe() - density = (example_table.matrix_data.getnnz() / + density = (float(example_table.matrix_data.getnnz()) / np.prod(example_table.shape)) assert np.allclose(df.density, density)