Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Align methods for #858 #859

Merged
merged 18 commits into from
Jul 28, 2021
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions ChangeLog.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ Bug fixes:

* During deployment testing for QIIME 2 2020.11, it was observed that certain combinations of hdf5 or h5py dependencies can result in metadata strings parsing as ASCII rather than UTF-8. Parsing of BIOM-Format 2.1.0 files now normalizes metadata strings as UTF-8, see [PR #853](https://github.com/biocore/biom-format/pull/853).

New Features

* Added support for aligning dataframes and trees against biom tables with `Table.align_to_dataframe` and `Table.align_tree`; see [PR #859](https://github.com/biocore/biom-format/pull/859).

biom 2.1.9
----------

Expand Down
57 changes: 57 additions & 0 deletions biom/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -948,6 +948,63 @@ def _get_col(self, col_idx):
self._data = self._data.tocsc()
return self._data.getcol(col_idx)

def align_to_dataframe(self, metadata, axis='sample'):
    """Align a dataframe against this table, keeping only common ids.

    Parameters
    ----------
    metadata : pd.DataFrame
        The metadata, indexed by the ids of the chosen axis (sample ids
        for ``axis='sample'``, observation ids for
        ``axis='observation'``).
    axis : {'sample', 'observation'}
        The axis on which to operate.

    Returns
    -------
    biom.Table
        A filtered biom table.
    pd.DataFrame
        The metadata rows for the ids retained in the table, in the
        table's id order.

    Raises
    ------
    TableException
        If the table and the dataframe index share no ids.
    """
    ids = set(self.ids(axis=axis)) & set(metadata.index)
    if not ids:
        raise TableException("No common ids between table and dataframe.")

    # Pass the ids to keep directly, the same way ``align_tree`` does,
    # rather than wrapping the membership test in a callback.
    t = self.filter(ids, axis=axis, inplace=False)
    # The return value is intentionally unused: the copy is expected to be
    # modified in place (NOTE(review): relies on remove_empty's defaults).
    t.remove_empty()
    # Index with the table's remaining ids so the metadata rows follow the
    # table order and ids dropped by remove_empty() are excluded as well.
    md = metadata.loc[t.ids(axis=axis)]
    return t, md

def align_tree(self, tree, axis='observation'):
    """Align this table against a tree, keeping only common ids.

    Parameters
    ----------
    tree : skbio.TreeNode
        The tree object, whose tip names are matched against the ids of
        the chosen axis (sample ids or observation ids).
    axis : {'sample', 'observation'}
        The axis on which to operate.

    Returns
    -------
    biom.Table
        A filtered biom table, ordered on ``axis`` by the tip order of the
        returned tree.
    skbio.TreeNode
        A filtered skbio TreeNode object.

    Raises
    ------
    TableException
        If the table and the tree tips share no ids, or if none of the
        shared ids remain after empty vectors are removed.
    """
    tips = {x.name for x in tree.tips()}
    common_tips = tips & set(self.ids(axis=axis))
    if not common_tips:
        raise TableException("No common ids between table and tree.")

    _table = self.filter(common_tips, axis=axis, inplace=False)
    _table.remove_empty()

    # Shear the tree against the ids that are still in the table *after*
    # remove_empty(); otherwise a shared id whose vector is all zeros would
    # stay in the tree but not in the table, and sort_order() below would
    # be asked for an id the table no longer has.
    kept = set(_table.ids(axis=axis))
    if not kept:
        raise TableException(
            "No non-empty common ids between table and tree.")

    _tree = tree.shear(names=kept)
    _tree.prune()
    order = [n.name for n in _tree.tips()]
    _table = _table.sort_order(order, axis=axis)
    return _table, _tree

def reduce(self, f, axis):
"""Reduce over axis using function `f`

Expand Down
196 changes: 196 additions & 0 deletions biom/tests/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,12 @@
except ImportError:
HAVE_ANNDATA = False

# scikit-bio is an optional dependency; tests that need it are skipped when
# it is not importable.
try:
    import skbio
    HAVE_SKBIO = True
except ImportError:
    HAVE_SKBIO = False

__author__ = "Daniel McDonald"
__copyright__ = "Copyright 2011-2017, The BIOM Format Development Team"
__credits__ = ["Daniel McDonald", "Jai Ram Rideout", "Justin Kuczynski",
Expand Down Expand Up @@ -1815,6 +1821,196 @@ def test_add_sample_metadata_two_entries(self):
self.assertEqual(t._sample_metadata[2]['D'], ['A', 'C'])
self.assertEqual(t._sample_metadata[3]['D'], ['A', 'D'])

def test_align_to_dataframe_samples(self):
    """Sample 's4' is absent from the metadata and must be dropped."""
    columns = ['Barcode', 'Treatment']
    obs_ids = ['o1', 'o2', 'o3', 'o4']

    counts = np.array([[0, 0, 1, 1],
                       [2, 2, 4, 4],
                       [5, 5, 3, 3],
                       [0, 0, 0, 1]]).T
    table = Table(counts, obs_ids, ['s1', 's2', 's3', 's4'])

    # The metadata rows are deliberately not in table order.
    metadata = pd.DataFrame([['a', 'control'],
                             ['c', 'diseased'],
                             ['b', 'control']],
                            index=['s1', 's3', 's2'],
                            columns=columns)

    exp_counts = np.array([[0, 0, 1, 1],
                           [2, 2, 4, 4],
                           [5, 5, 3, 3]]).T
    exp_table = Table(exp_counts, obs_ids, ['s1', 's2', 's3'])
    exp_metadata = pd.DataFrame([['a', 'control'],
                                 ['b', 'control'],
                                 ['c', 'diseased']],
                                index=['s1', 's2', 's3'],
                                columns=columns)

    res_table, res_metadata = table.align_to_dataframe(metadata)

    pdt.assert_frame_equal(exp_metadata, res_metadata)
    self.assertEqual(res_table.descriptive_equality(exp_table),
                     'Tables appear equal')

def test_align_to_dataframe_observations(self):
    """Observation 's4' is absent from the metadata and must be dropped."""
    columns = ['Barcode', 'Treatment']
    samp_ids = ['o1', 'o2', 'o3', 'o4']

    # In this fixture the 's*' labels are the observation ids.
    counts = np.array([[0, 0, 1, 1],
                       [2, 2, 4, 4],
                       [5, 5, 3, 3],
                       [0, 0, 0, 1]])
    table = Table(counts, ['s1', 's2', 's3', 's4'], samp_ids)

    metadata = pd.DataFrame([['a', 'control'],
                             ['c', 'diseased'],
                             ['b', 'control']],
                            index=['s1', 's3', 's2'],
                            columns=columns)

    exp_counts = np.array([[0, 0, 1, 1],
                           [2, 2, 4, 4],
                           [5, 5, 3, 3]])
    exp_table = Table(exp_counts, ['s1', 's2', 's3'], samp_ids)
    exp_metadata = pd.DataFrame([['a', 'control'],
                                 ['b', 'control'],
                                 ['c', 'diseased']],
                                index=['s1', 's2', 's3'],
                                columns=columns)

    res_table, res_metadata = table.align_to_dataframe(
        metadata, axis='observation')

    pdt.assert_frame_equal(exp_metadata, res_metadata)
    self.assertEqual(res_table.descriptive_equality(exp_table),
                     'Tables appear equal')

def test_align_to_dataframe_samples_remove_empty(self):
    """Dropping 's4' leaves 'o4' all-zero, so 'o4' is removed as well."""
    columns = ['Barcode', 'Treatment']

    counts = np.array([[0, 0, 1, 0],
                       [2, 2, 4, 0],
                       [5, 5, 3, 0],
                       [0, 0, 0, 1]]).T
    table = Table(counts,
                  ['o1', 'o2', 'o3', 'o4'],
                  ['s1', 's2', 's3', 's4'])

    metadata = pd.DataFrame([['a', 'control'],
                             ['c', 'diseased'],
                             ['b', 'control']],
                            index=['s1', 's3', 's2'],
                            columns=columns)

    exp_counts = np.array([[0, 0, 1],
                           [2, 2, 4],
                           [5, 5, 3]]).T
    exp_table = Table(exp_counts,
                      ['o1', 'o2', 'o3'],
                      ['s1', 's2', 's3'])
    exp_metadata = pd.DataFrame([['a', 'control'],
                                 ['b', 'control'],
                                 ['c', 'diseased']],
                                index=['s1', 's2', 's3'],
                                columns=columns)

    res_table, res_metadata = table.align_to_dataframe(metadata)

    pdt.assert_frame_equal(exp_metadata, res_metadata)
    self.assertEqual(res_table.descriptive_equality(exp_table),
                     'Tables appear equal')

def test_align_to_dataframe_samples_empty(self):
    """Align on samples where the only 'o4' count lives in dropped 's4'."""
    # NOTE(review): the fixture and expectations here are the same as in
    # test_align_to_dataframe_samples_remove_empty -- confirm whether a
    # different scenario was intended for this test.
    columns = ['Barcode', 'Treatment']

    counts = np.array([[0, 0, 1, 0],
                       [2, 2, 4, 0],
                       [5, 5, 3, 0],
                       [0, 0, 0, 1]]).T
    table = Table(counts,
                  ['o1', 'o2', 'o3', 'o4'],
                  ['s1', 's2', 's3', 's4'])

    metadata = pd.DataFrame([['a', 'control'],
                             ['c', 'diseased'],
                             ['b', 'control']],
                            index=['s1', 's3', 's2'],
                            columns=columns)

    exp_counts = np.array([[0, 0, 1],
                           [2, 2, 4],
                           [5, 5, 3]]).T
    exp_table = Table(exp_counts,
                      ['o1', 'o2', 'o3'],
                      ['s1', 's2', 's3'])
    exp_metadata = pd.DataFrame([['a', 'control'],
                                 ['b', 'control'],
                                 ['c', 'diseased']],
                                index=['s1', 's2', 's3'],
                                columns=columns)

    res_table, res_metadata = table.align_to_dataframe(metadata)

    pdt.assert_frame_equal(exp_metadata, res_metadata)
    self.assertEqual(res_table.descriptive_equality(exp_table),
                     'Tables appear equal')

def test_align_to_dataframe_samples_no_common_ids(self):
    """Aligning on samples raises when no sample id is in the metadata."""
    # The 's*' labels are observation ids here, so the sample axis
    # ('o1'..'o4') shares nothing with the metadata index.
    counts = np.array([[0, 0, 1, 0],
                       [2, 2, 4, 0],
                       [5, 5, 3, 0],
                       [0, 0, 0, 1]])
    table = Table(counts,
                  ['s1', 's2', 's3', 's4'],
                  ['o1', 'o2', 'o3', 'o4'])

    metadata = pd.DataFrame([['a', 'control'],
                             ['c', 'diseased'],
                             ['b', 'control']],
                            index=['s1', 's3', 's2'],
                            columns=['Barcode', 'Treatment'])

    with self.assertRaises(TableException):
        table.align_to_dataframe(metadata)

@pytest.mark.skipif(not HAVE_SKBIO, reason="skbio not installed")
def test_align_tree_intersect_tips(self):
    """The tree has fewer tips than the table has observations."""
    samp_ids = ['s1', 's2', 's3', 's4']

    counts = np.array([[0, 0, 1, 1],
                       [2, 3, 4, 4],
                       [5, 5, 3, 3],
                       [0, 0, 0, 1]]).T
    table = Table(counts, ['a', 'b', 'c', 'd'], samp_ids)
    tree = skbio.TreeNode.read([u"((a,b)f,d)r;"])

    # Observation 'c' is not a tip of the tree, so it is dropped.
    exp_counts = np.array([[0, 0, 1],
                           [2, 3, 4],
                           [5, 5, 3],
                           [0, 0, 1]]).T
    exp_table = Table(exp_counts, ['a', 'b', 'd'], samp_ids)
    exp_tree = tree

    res_table, res_tree = table.align_tree(tree)

    self.assertEqual(res_table.descriptive_equality(exp_table),
                     'Tables appear equal')
    self.assertEqual(str(exp_tree), str(res_tree))

@pytest.mark.skipif(not HAVE_SKBIO, reason="skbio not installed")
def test_align_tree_intersect_obs(self):
    """The table has fewer observations than the tree has tips."""
    samp_ids = ['s1', 's2', 's3', 's4']

    counts = np.array([[0, 0, 1],
                       [2, 3, 4],
                       [5, 5, 3],
                       [0, 0, 1]]).T
    table = Table(counts, ['a', 'b', 'd'], samp_ids)
    tree = skbio.TreeNode.read([u"(((a,b)f, c),d)r;"])

    # Tip 'c' has no observation; the table follows the tip order d, a, b.
    exp_counts = np.array([[1, 0, 0],
                           [4, 2, 3],
                           [3, 5, 5],
                           [1, 0, 0]]).T
    exp_table = Table(exp_counts, ['d', 'a', 'b'], samp_ids)
    exp_tree = skbio.TreeNode.read([u"(d,(a,b)f)r;"])

    res_table, res_tree = table.align_tree(tree)

    self.assertEqual(res_table.descriptive_equality(exp_table),
                     'Tables appear equal')
    self.assertEqual(str(exp_tree), str(res_tree))
mortonjt marked this conversation as resolved.
Show resolved Hide resolved

@pytest.mark.skipif(not HAVE_SKBIO, reason="skbio not installed")
def test_align_tree_sample(self):
    """The table has fewer samples than the tree has tips."""
    obs_ids = ['s1', 's2', 's3', 's4']

    # In this fixture the tree tip names are matched against sample ids.
    counts = np.array([[0, 0, 1],
                       [2, 3, 4],
                       [5, 5, 3],
                       [0, 0, 1]])
    table = Table(counts, obs_ids, ['a', 'b', 'd'])
    tree = skbio.TreeNode.read([u"(((a,b)f, c),d)r;"])

    exp_counts = np.array([[1, 0, 0],
                           [4, 2, 3],
                           [3, 5, 5],
                           [1, 0, 0]])
    exp_table = Table(exp_counts, obs_ids, ['d', 'a', 'b'])
    exp_tree = skbio.TreeNode.read([u"(d,(a,b)f)r;"])

    res_table, res_tree = table.align_tree(tree, axis='sample')

    self.assertEqual(res_table.descriptive_equality(exp_table),
                     'Tables appear equal')
    self.assertEqual(str(exp_tree), str(res_tree))

def test_get_value_by_ids(self):
"""Return the value located in the matrix by the ids"""
t1 = Table(np.array([[5, 6], [7, 8]]), [3, 4], [1, 2])
Expand Down