diff --git a/CHANGELOG.md b/CHANGELOG.md index 27164b8..08e45ee 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## Version 0.0.2 (changes since 0.0.2 go here) ### Features +* Adding in utility functions for handing feature tables, metadata, and trees. [#12](https://github.com/biocore/gneiss/pull/12) * Adding GPL license. ### Bug fixes diff --git a/gneiss/tests/test_util.py b/gneiss/tests/test_util.py new file mode 100644 index 0000000..fecf40f --- /dev/null +++ b/gneiss/tests/test_util.py @@ -0,0 +1,313 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2016--, gneiss development team. +# +# Distributed under the terms of the GPLv3 License. +# +# The full license is in the file COPYING.txt, distributed with this software. +# ---------------------------------------------------------------------------- + +import unittest +import pandas as pd +import pandas.util.testing as pdt +from skbio import TreeNode +from gneiss.util import match, match_tips, rename_internal_nodes + + +class TestUtil(unittest.TestCase): + + def test_match(self): + table = pd.DataFrame([[0, 0, 1, 1], + [2, 2, 4, 4], + [5, 5, 3, 3], + [0, 0, 0, 1]], + index=['s1', 's2', 's3', 's4'], + columns=['o1', 'o2', 'o3', 'o4']) + metadata = pd.DataFrame([['a', 'control'], + ['b', 'control'], + ['c', 'diseased'], + ['d', 'diseased']], + index=['s1', 's2', 's3', 's4'], + columns=['Barcode', 'Treatment']) + exp_table, exp_metadata = table, metadata + res_table, res_metadata = match(table, metadata) + + # make sure that the metadata and table indeces match + pdt.assert_index_equal(res_table.index, res_metadata.index) + + res_table = res_table.sort_index() + exp_table = exp_table.sort_index() + + res_metadata = res_metadata.sort_index() + exp_metadata = exp_metadata.sort_index() + + pdt.assert_frame_equal(exp_table, res_table) + pdt.assert_frame_equal(exp_metadata, res_metadata) + + def test_match_immutable(self): + # tests to make sure that the original tables don't change. + table = pd.DataFrame([[0, 0, 1, 1], + [2, 2, 4, 4], + [5, 5, 3, 3], + [0, 0, 0, 1]], + index=['s1', 's2', 's3', 's4'], + columns=['o1', 'o2', 'o3', 'o4']) + metadata = pd.DataFrame([['a', 'control'], + ['c', 'diseased'], + ['b', 'control']], + index=['s1', 's3', 's2'], + columns=['Barcode', 'Treatment']) + + exp_table = pd.DataFrame([[0, 0, 1, 1], + [2, 2, 4, 4], + [5, 5, 3, 3], + [0, 0, 0, 1]], + index=['s1', 's2', 's3', 's4'], + columns=['o1', 'o2', 'o3', 'o4']) + exp_metadata = pd.DataFrame([['a', 'control'], + ['c', 'diseased'], + ['b', 'control']], + index=['s1', 's3', 's2'], + columns=['Barcode', 'Treatment']) + match(table, metadata) + pdt.assert_frame_equal(table, exp_table) + pdt.assert_frame_equal(metadata, exp_metadata) + + def test_match_duplicate(self): + table1 = pd.DataFrame([[0, 0, 1, 1], + [2, 2, 4, 4], + [5, 5, 3, 3], + [0, 0, 0, 1]], + index=['s2', 's2', 's3', 's4'], + columns=['o1', 'o2', 'o3', 'o4']) + metadata1 = pd.DataFrame([['a', 'control'], + ['b', 'control'], + ['c', 'diseased'], + ['d', 'diseased']], + index=['s1', 's2', 's3', 's4'], + columns=['Barcode', 'Treatment']) + + table2 = pd.DataFrame([[0, 0, 1, 1], + [2, 2, 4, 4], + [5, 5, 3, 3], + [0, 0, 0, 1]], + index=['s1', 's2', 's3', 's4'], + columns=['o1', 'o2', 'o3', 'o4']) + metadata2 = pd.DataFrame([['a', 'control'], + ['b', 'control'], + ['c', 'diseased'], + ['d', 'diseased']], + index=['s1', 's1', 's3', 's4'], + columns=['Barcode', 'Treatment']) + + with self.assertRaises(ValueError): + match(table1, metadata1) + with self.assertRaises(ValueError): + match(table2, metadata2) + + def test_match_scrambled(self): + table = pd.DataFrame([[0, 0, 1, 1], + [2, 2, 4, 4], + [5, 5, 3, 3], + [0, 0, 0, 1]], + index=['s1', 's2', 's3', 's4'], + columns=['o1', 'o2', 'o3', 'o4']) + metadata = pd.DataFrame([['a', 'control'], + ['c', 'diseased'], + ['b', 'control'], + ['d', 'diseased']], + index=['s1', 's3', 's2', 's4'], + columns=['Barcode', 'Treatment']) + exp_table = table + exp_metadata = pd.DataFrame([['a', 'control'], + ['b', 'control'], + ['c', 'diseased'], + ['d', 'diseased']], + index=['s1', 's2', 's3', 's4'], + columns=['Barcode', 'Treatment']) + + res_table, res_metadata = match(table, metadata) + # make sure that the metadata and table indeces match + pdt.assert_index_equal(res_table.index, res_metadata.index) + + res_table = res_table.sort_index() + exp_table = exp_table.sort_index() + + res_metadata = res_metadata.sort_index() + exp_metadata = exp_metadata.sort_index() + + pdt.assert_frame_equal(exp_table, res_table) + pdt.assert_frame_equal(exp_metadata, res_metadata) + + def test_match_intersect(self): + table = pd.DataFrame([[0, 0, 1, 1], + [2, 2, 4, 4], + [5, 5, 3, 3], + [0, 0, 0, 1]], + index=['s1', 's2', 's3', 's4'], + columns=['o1', 'o2', 'o3', 'o4']) + metadata = pd.DataFrame([['a', 'control'], + ['c', 'diseased'], + ['b', 'control']], + index=['s1', 's3', 's2'], + columns=['Barcode', 'Treatment']) + + exp_table = pd.DataFrame([[0, 0, 1, 1], + [2, 2, 4, 4], + [5, 5, 3, 3]], + index=['s1', 's2', 's3'], + columns=['o1', 'o2', 'o3', 'o4']) + + exp_metadata = pd.DataFrame([['a', 'control'], + ['b', 'control'], + ['c', 'diseased']], + index=['s1', 's2', 's3'], + columns=['Barcode', 'Treatment']) + + res_table, res_metadata = match(table, metadata) + # sort for comparison, since the match function + # scrambles the names due to hashing. + res_table = res_table.sort_index() + res_metadata = res_metadata.sort_index() + pdt.assert_frame_equal(exp_table, res_table) + pdt.assert_frame_equal(exp_metadata, res_metadata) + + def test_match_tips(self): + table = pd.DataFrame([[0, 0, 1, 1], + [2, 2, 4, 4], + [5, 5, 3, 3], + [0, 0, 0, 1]], + index=['s1', 's2', 's3', 's4'], + columns=['a', 'b', 'c', 'd']) + tree = TreeNode.read([u"(((a,b)f, c),d)r;"]) + exp_table, exp_tree = table, tree + res_table, res_tree = match_tips(table, tree) + pdt.assert_frame_equal(exp_table, res_table) + self.assertEqual(str(exp_tree), str(res_tree)) + + def test_match_tips_scrambled_tips(self): + table = pd.DataFrame([[0, 0, 1, 1], + [2, 3, 4, 4], + [5, 5, 3, 3], + [0, 0, 0, 1]], + index=['s1', 's2', 's3', 's4'], + columns=['a', 'b', 'c', 'd']) + tree = TreeNode.read([u"(((b,a)f, c),d)r;"]) + exp_tree = tree + exp_table = pd.DataFrame([[0, 0, 1, 1], + [3, 2, 4, 4], + [5, 5, 3, 3], + [0, 0, 0, 1]], + index=['s1', 's2', 's3', 's4'], + columns=['b', 'a', 'c', 'd']) + + res_table, res_tree = match_tips(table, tree) + pdt.assert_frame_equal(exp_table, res_table) + self.assertEqual(str(exp_tree), str(res_tree)) + + def test_match_tips_scrambled_columns(self): + table = pd.DataFrame([[0, 0, 1, 1], + [3, 2, 4, 4], + [5, 5, 3, 3], + [0, 0, 0, 1]], + index=['s1', 's2', 's3', 's4'], + columns=['b', 'a', 'c', 'd']) + tree = TreeNode.read([u"(((a,b)f, c),d)r;"]) + exp_tree = tree + exp_table = pd.DataFrame([[0, 0, 1, 1], + [2, 3, 4, 4], + [5, 5, 3, 3], + [0, 0, 0, 1]], + index=['s1', 's2', 's3', 's4'], + columns=['a', 'b', 'c', 'd']) + + res_table, res_tree = match_tips(table, tree) + pdt.assert_frame_equal(exp_table, res_table) + self.assertEqual(str(exp_tree), str(res_tree)) + + def test_match_tips_intersect_tips(self): + # there are less tree tips than table columns + table = pd.DataFrame([[0, 0, 1, 1], + [2, 3, 4, 4], + [5, 5, 3, 3], + [0, 0, 0, 1]], + index=['s1', 's2', 's3', 's4'], + columns=['a', 'b', 'c', 'd']) + tree = TreeNode.read([u"((a,b)f,d)r;"]) + exp_table = pd.DataFrame([[0, 0, 1], + [2, 3, 4], + [5, 5, 3], + [0, 0, 1]], + index=['s1', 's2', 's3', 's4'], + columns=['a', 'b', 'd']) + exp_tree = tree + res_table, res_tree = match_tips(table, tree) + pdt.assert_frame_equal(exp_table, res_table) + self.assertEqual(str(exp_tree), str(res_tree)) + + def test_match_tips_intersect_columns(self): + # table has less columns than tree tips + table = pd.DataFrame([[0, 0, 1], + [2, 3, 4], + [5, 5, 3], + [0, 0, 1]], + index=['s1', 's2', 's3', 's4'], + columns=['a', 'b', 'd']) + tree = TreeNode.read([u"(((a,b)f, c),d)r;"]) + exp_table = pd.DataFrame([[1, 0, 0], + [4, 2, 3], + [3, 5, 5], + [1, 0, 0]], + index=['s1', 's2', 's3', 's4'], + columns=['d', 'a', 'b']) + exp_tree = TreeNode.read([u"(d,(a,b)f)r;"]) + res_table, res_tree = match_tips(table, tree) + pdt.assert_frame_equal(exp_table, res_table) + self.assertEqual(str(exp_tree), str(res_tree)) + + def test_match_tips_intersect_tree_immutable(self): + # tests to see if tree chnages. + table = pd.DataFrame([[0, 0, 1], + [2, 3, 4], + [5, 5, 3], + [0, 0, 1]], + index=['s1', 's2', 's3', 's4'], + columns=['a', 'b', 'd']) + tree = TreeNode.read([u"(((a,b)f, c),d)r;"]) + match_tips(table, tree) + self.assertEqual(str(tree), u"(((a,b)f,c),d)r;\n") + + def test_rename_internal_nodes(self): + tree = TreeNode.read([u"(((a,b), c),d)r;"]) + exp_tree = TreeNode.read([u"(((a,b)y2, c)y1,d)y0;"]) + res_tree = rename_internal_nodes(tree) + self.assertEqual(str(exp_tree), str(res_tree)) + + def test_rename_internal_nodes_names(self): + tree = TreeNode.read([u"(((a,b), c),d)r;"]) + exp_tree = TreeNode.read([u"(((a,b)ab, c)abc,d)r;"]) + res_tree = rename_internal_nodes(tree, ['r', 'abc', 'ab']) + self.assertEqual(str(exp_tree), str(res_tree)) + + def test_rename_internal_nodes_names_mismatch(self): + tree = TreeNode.read([u"(((a,b), c),d)r;"]) + with self.assertRaises(ValueError): + rename_internal_nodes(tree, ['r', 'abc']) + + def test_rename_internal_nodes_warning(self): + tree = TreeNode.read([u"(((a,b)y2, c),d)r;"]) + with self.assertWarns(Warning): + rename_internal_nodes(tree) + + def test_rename_internal_nodes_immutable(self): + tree = TreeNode.read([u"(((a,b)y2, c),d)r;"]) + rename_internal_nodes(tree) + self.assertEqual(str(tree), "(((a,b)y2,c),d)r;\n") + + def test_rename_internal_nodes_mutable(self): + tree = TreeNode.read([u"(((a,b)y2, c),d)r;"]) + rename_internal_nodes(tree, inplace=True) + self.assertEqual(str(tree), "(((a,b)y2,c)y1,d)y0;\n") + + +if __name__ == '__main__': + unittest.main() diff --git a/gneiss/util.py b/gneiss/util.py new file mode 100644 index 0000000..98e7031 --- /dev/null +++ b/gneiss/util.py @@ -0,0 +1,162 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2016--, gneiss development team. +# +# Distributed under the terms of the GPLv3 License. +# +# The full license is in the file COPYING.txt, distributed with this software. +# ---------------------------------------------------------------------------- +import warnings + + +def match(table, metadata): + """ Matches samples between a contingency table and a metadata table. + + Sorts samples in metadata and contingency table in the same order. + If there are sames contained in the contigency table, but not in metadata + or vice versa, the intersection of samples in the contingency table and the + metadata table will returned. + + Parameters + ---------- + table : pd.DataFrame + Contingency table where samples correspond to rows and + features correspond to columns. + metadata: pd.DataFrame + Metadata table where samples correspond to rows and + explanatory metadata variables correspond to columns. + + Returns + ------- + pd.DataFrame : + Filtered contingency table. + pd.DataFrame : + Filtered metadata table + + Raises + ------ + ValueError: + Raised if duplicate sample ids are present in `table`. + ValueError: + Raised if duplicate sample ids are present in `metadata`. + ValueError: + Raised if `table` and `metadata` have incompatible sizes. + + """ + subtableids = set(table.index) + submetadataids = set(metadata.index) + if len(subtableids) != len(table.index): + raise ValueError("`table` has duplicate sample ids.") + if len(submetadataids) != len(metadata.index): + raise ValueError("`metadata` has duplicate sample ids.") + + idx = subtableids & submetadataids + subtable = table.loc[idx] + submetadata = metadata.loc[idx] + return subtable, submetadata + + +def match_tips(table, tree): + """ Returns the contingency table and tree with matched tips. + + Sorts the columns of the contingency table to match the tips in + the tree. The ordering of the tips is in post-traversal order. + + If the tree is multi-furcating, then the tree is reduced to a + bifurcating tree by randomly inserting internal nodes. + + The intersection of samples in the contingency table and the + tree will returned. + + Parameters + ---------- + table : pd.DataFrame + Contingency table where samples correspond to rows and + features correspond to columns. + tree : skbio.TreeNode + Tree object where the leafs correspond to the features. + + Returns + ------- + pd.DataFrame : + Subset of the original contingency table with the common features. + skbio.TreeNode : + Sub-tree with the common features. + + Raises + ------ + ValueError: + Raised if `table` and `tree` have incompatible sizes. + + See Also + -------- + skbio.TreeNode.bifurcate + skbio.TreeNode.tips + """ + tips = [x.name for x in tree.tips()] + common_tips = list(set(tips) & set(table.columns)) + + _table = table.loc[:, common_tips] + _tree = tree.shear(names=common_tips) + + _tree.bifurcate() + _tree.prune() + sorted_features = [n.name for n in _tree.tips()] + _table = _table.reindex_axis(sorted_features, axis=1) + + return _table, _tree + + +def rename_internal_nodes(tree, names=None, inplace=False): + """ Names the internal according to level ordering. + + The tree will be traversed in level order (i.e. top-down, left to right). + If `names` is not specified, the node with the smallest label (y0) + will be located at the root of the tree, and the node with the largest + label will be located at bottom right corner of the tree. + + Parameters + ---------- + tree : skbio.TreeNode + Tree object where the leafs correspond to the features. + names : list, optional + List of labels to rename the tip names. It is assumed that the + names are listed in level ordering, and the length of the list + is at least as long as the number of internal nodes. + inplace : bool, optional + Specifies if the operation should be done on the original tree or not. + + Returns + ------- + skbio.TreeNode + Tree with renamed internal nodes. + + Raises + ------ + ValueError: + Raised if `tree` and `name` have incompatible sizes. + """ + if inplace: + _tree = tree + else: + _tree = tree.copy() + + non_tips = [n for n in _tree.levelorder() if not n.is_tip()] + if names is not None and len(non_tips) != len(names): + raise ValueError("`_tree` and `names` have incompatible sizes, " + "`_tree` has %d tips, `names` has %d elements." % + (len(non_tips), len(names))) + + i = 0 + for n in _tree.levelorder(): + if not n.is_tip(): + if names is None: + label = 'y%i' % i + else: + label = names[i] + if n.name is not None and label == n.name: + warnings.warn("Warning. Internal node (%s) has been replaced " + "with (%s)" % (n.name, label)) + + n.name = label + i += 1 + return _tree diff --git a/setup.py b/setup.py index 0f081de..28a305e 100644 --- a/setup.py +++ b/setup.py @@ -35,7 +35,7 @@ def finalize_options(self): extensions = cythonize(extensions) classes = """ - Development Status :: 0 - pre-alpha + Development Status :: 2 - Pre-Alpha License :: OSI Approved :: BSD License Topic :: Software Development :: Libraries Topic :: Scientific/Engineering