From 362fe9e470f88c9b2b84d744863885033170d126 Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Mon, 18 Jul 2016 17:43:13 -0700 Subject: [PATCH 01/12] ENH: Adding util functions Correcting documentation in setup.py --- gneiss/tests/test_util.py | 241 ++++++++++++++++++++++++++++++++++++++ gneiss/util.py | 121 +++++++++++++++++++ setup.py | 2 +- 3 files changed, 363 insertions(+), 1 deletion(-) create mode 100644 gneiss/tests/test_util.py create mode 100644 gneiss/util.py diff --git a/gneiss/tests/test_util.py b/gneiss/tests/test_util.py new file mode 100644 index 0000000..df6a6ee --- /dev/null +++ b/gneiss/tests/test_util.py @@ -0,0 +1,241 @@ +import unittest +import pandas as pd +import pandas.util.testing as pdt +from skbio import TreeNode +from gneiss.util import match, match_tips, rename_tips + + +class TestUtil(unittest.TestCase): + + def test_match(self): + table = pd.DataFrame([[0, 0, 1, 1], + [2, 2, 4, 4], + [5, 5, 3, 3], + [0, 0, 0, 1]], + index=['s1', 's2', 's3', 's4'], + columns=['o1', 'o2', 'o3', 'o4']) + metadata = pd.DataFrame([['a', 'control'], + ['b', 'control'], + ['c', 'diseased'], + ['d', 'diseased']], + index=['s1', 's2', 's3', 's4'], + columns=['Barcode', 'Treatment']) + exp_table, exp_metadata = table, metadata + res_table, res_metadata = match(table, metadata) + pdt.assert_frame_equal(exp_table, res_table) + pdt.assert_frame_equal(exp_metadata, res_metadata) + + + def test_match_scrambled(self): + table = pd.DataFrame([[0, 0, 1, 1], + [2, 2, 4, 4], + [5, 5, 3, 3], + [0, 0, 0, 1]], + index=['s1', 's2', 's3', 's4'], + columns=['o1', 'o2', 'o3', 'o4']) + metadata = pd.DataFrame([['a', 'control'], + ['c', 'diseased'], + ['b', 'control'], + ['d', 'diseased']], + index=['s1', 's3', 's2', 's4'], + columns=['Barcode', 'Treatment']) + exp_table = table + exp_metadata = pd.DataFrame([['a', 'control'], + ['b', 'control'], + ['c', 'diseased'], + ['d', 'diseased']], + index=['s1', 's2', 's3', 's4'], + columns=['Barcode', 'Treatment']) + + res_table, 
res_metadata = match(table, metadata) + pdt.assert_frame_equal(exp_table, res_table) + pdt.assert_frame_equal(exp_metadata, res_metadata) + + def test_match_intersect(self): + table = pd.DataFrame([[0, 0, 1, 1], + [2, 2, 4, 4], + [5, 5, 3, 3], + [0, 0, 0, 1]], + index=['s1', 's2', 's3', 's4'], + columns=['o1', 'o2', 'o3', 'o4']) + metadata = pd.DataFrame([['a', 'control'], + ['c', 'diseased'], + ['b', 'control']], + index=['s1', 's3', 's2'], + columns=['Barcode', 'Treatment']) + + exp_table = pd.DataFrame([[0, 0, 1, 1], + [2, 2, 4, 4], + [5, 5, 3, 3]], + index=['s1', 's2', 's3'], + columns=['o1', 'o2', 'o3', 'o4']) + + exp_metadata = pd.DataFrame([['a', 'control'], + ['b', 'control'], + ['c', 'diseased']], + index=['s1', 's2', 's3'], + columns=['Barcode', 'Treatment']) + + res_table, res_metadata = match(table, metadata, intersect=True) + pdt.assert_frame_equal(exp_table, res_table) + pdt.assert_frame_equal(exp_metadata, res_metadata) + + def test_match_mismatch(self): + table = pd.DataFrame([[0, 0, 1, 1], + [2, 2, 4, 4], + [5, 5, 3, 3], + [0, 0, 0, 1]], + index=['s1', 's2', 's3', 's4'], + columns=['o1', 'o2', 'o3', 'o4']) + metadata = pd.DataFrame([['a', 'control'], + ['c', 'diseased'], + ['b', 'control']], + index=['s1', 's3', 's2'], + columns=['Barcode', 'Treatment']) + + exp_table = pd.DataFrame([[0, 0, 1, 1], + [2, 2, 4, 4], + [5, 5, 3, 3]], + index=['s1', 's2', 's3'], + columns=['o1', 'o2', 'o3', 'o4']) + + exp_metadata = pd.DataFrame([['a', 'control'], + ['b', 'control'], + ['c', 'diseased']], + index=['s1', 's2', 's3'], + columns=['Barcode', 'Treatment']) + with self.assertRaises(ValueError): + match(table, metadata) + + def test_match_tips(self): + table = pd.DataFrame([[0, 0, 1, 1], + [2, 2, 4, 4], + [5, 5, 3, 3], + [0, 0, 0, 1]], + index=['s1', 's2', 's3', 's4'], + columns=['a', 'b', 'c', 'd']) + tree = TreeNode.read([u"(((a,b)f, c),d)r;"]) + exp_table, exp_tree = table, tree + res_table, res_tree = match_tips(table, tree) + 
pdt.assert_frame_equal(exp_table, res_table) + self.assertEqual(str(exp_tree), str(res_tree)) + + def test_match_tips_scrambled_tips(self): + table = pd.DataFrame([[0, 0, 1, 1], + [2, 3, 4, 4], + [5, 5, 3, 3], + [0, 0, 0, 1]], + index=['s1', 's2', 's3', 's4'], + columns=['a', 'b', 'c', 'd']) + tree = TreeNode.read([u"(((b,a)f, c),d)r;"]) + exp_tree = tree + exp_table = pd.DataFrame([[0, 0, 1, 1], + [3, 2, 4, 4], + [5, 5, 3, 3], + [0, 0, 0, 1]], + index=['s1', 's2', 's3', 's4'], + columns=['b', 'a', 'c', 'd']) + + res_table, res_tree = match_tips(table, tree) + pdt.assert_frame_equal(exp_table, res_table) + self.assertEqual(str(exp_tree), str(res_tree)) + + def test_match_tips_scrambled_columns(self): + table = pd.DataFrame([[0, 0, 1, 1], + [3, 2, 4, 4], + [5, 5, 3, 3], + [0, 0, 0, 1]], + index=['s1', 's2', 's3', 's4'], + columns=['b', 'a', 'c', 'd']) + tree = TreeNode.read([u"(((a,b)f, c),d)r;"]) + exp_tree = tree + exp_table = pd.DataFrame([[0, 0, 1, 1], + [2, 3, 4, 4], + [5, 5, 3, 3], + [0, 0, 0, 1]], + index=['s1', 's2', 's3', 's4'], + columns=['a', 'b', 'c', 'd']) + + res_table, res_tree = match_tips(table, tree) + pdt.assert_frame_equal(exp_table, res_table) + self.assertEqual(str(exp_tree), str(res_tree)) + + + def test_match_tips_intersect_tips(self): + # there are less tree tips than table columns + table = pd.DataFrame([[0, 0, 1, 1], + [2, 3, 4, 4], + [5, 5, 3, 3], + [0, 0, 0, 1]], + index=['s1', 's2', 's3', 's4'], + columns=['a', 'b', 'c', 'd']) + tree = TreeNode.read([u"((a,b)f,d)r;"]) + exp_table = pd.DataFrame([[0, 0, 1], + [2, 3, 4], + [5, 5, 3], + [0, 0, 1]], + index=['s1', 's2', 's3', 's4'], + columns=['a', 'b', 'd']) + exp_tree = tree + res_table, res_tree = match_tips(table, tree, intersect=True) + pdt.assert_frame_equal(exp_table, res_table) + self.assertEqual(str(exp_tree), str(res_tree)) + + def test_match_tips_intersect_columns(self): + # table has less columns than tree tips + table = pd.DataFrame([[0, 0, 1], + [2, 3, 4], + [5, 5, 3], + [0, 
0, 1]], + index=['s1', 's2', 's3', 's4'], + columns=['a', 'b', 'd']) + tree = TreeNode.read([u"(((a,b)f, c),d)r;"]) + exp_table = pd.DataFrame([[1, 0, 0], + [4, 2, 3], + [3, 5, 5], + [1, 0, 0]], + index=['s1', 's2', 's3', 's4'], + columns=['d', 'a', 'b']) + exp_tree = TreeNode.read([u"(d,(a,b)f)r;"]) + res_table, res_tree = match_tips(table, tree, intersect=True) + pdt.assert_frame_equal(exp_table, res_table) + self.assertEqual(str(exp_tree), str(res_tree)) + + def test_match_tips_mismatch(self): + # table has less columns than tree tips + table = pd.DataFrame([[0, 0, 1], + [2, 3, 4], + [5, 5, 3], + [0, 0, 1]], + index=['s1', 's2', 's3', 's4'], + columns=['a', 'b', 'd']) + tree = TreeNode.read([u"(((a,b)f, c),d)r;"]) + with self.assertRaises(ValueError): + match_tips(table, tree) + + table = pd.DataFrame([[0, 0, 1, 1], + [2, 3, 4, 4], + [5, 5, 3, 3], + [0, 0, 0, 1]], + index=['s1', 's2', 's3', 's4'], + columns=['a', 'b', 'c', 'd']) + tree = TreeNode.read([u"((a,b)f,d)r;"]) + with self.assertRaises(ValueError): + match_tips(table, tree) + + + def test_rename_tips(self): + tree = TreeNode.read([u"(((a,b), c),d)r;"]) + exp_tree = TreeNode.read([u"(((a,b)y2, c)y1,d)y0;"]) + res_tree = rename_tips(tree) + self.assertEqual(str(exp_tree), str(res_tree)) + + + def test_rename_tips_names(self): + tree = TreeNode.read([u"(((a,b), c),d)r;"]) + exp_tree = TreeNode.read([u"(((a,b)ab, c)abc,d)r;"]) + res_tree = rename_tips(tree, ['r', 'abc', 'ab']) + self.assertEqual(str(exp_tree), str(res_tree)) + +if __name__ == '__main__': + unittest.main() diff --git a/gneiss/util.py b/gneiss/util.py new file mode 100644 index 0000000..77cb390 --- /dev/null +++ b/gneiss/util.py @@ -0,0 +1,121 @@ +import pandas as pd +import numpy as np + + +def match(x, y, intersect=False): + """ Sorts samples in metadata and contingency table in the same order. + + Parameters + ---------- + x : pd.DataFrame + Contingency table where samples correspond to rows and + features correspond to columns. 
+ y: pd.DataFrame + Metadata table where samples correspond to rows and + explanatory metadata variables correspond to columns. + intersect : bool, optional + Specifies if only the intersection of samples in the + contingency table and the metadata table will returned. + + Returns + ------- + _x : pd.DataFrame + Filtered dataframe + _y : pd.DataFrame + Filtered dataframe + """ + _x = x.sort_index() + _y = y.sort_index() + if intersect: + idx = set(_x.index) & set(_y.index) + idx = sorted(idx) + return _x.loc[idx], _y.loc[idx] + else: + if len(_x.index) != len(_y.index): + raise ValueError("`x` and `y` have incompatible sizes, " + "`x` has %d rows, `y` has %d rows. " + "Consider setting `intersect=True`." % + (len(_x.index), len(_y.index))) + return _x, _y + + +def match_tips(table, tree, intersect=False): + """ Returns the OTU table and tree with matched tips. + + Sorts the columns of the OTU table to match the tips in + the tree. If the tree is multi-furcating, then the + tree is reduced to a bifurcating tree by randomly inserting + internal nodes. + + + Parameters + ---------- + table : pd.DataFrame + Contingency table where samples correspond to rows and + features correspond to columns. + tree : skbio.TreeNode + Tree object where the leafs correspond to the features. + intersect : bool, optional + Specifies if only the intersection of samples in the + contingency table and the tree will returned. + + Returns + ------- + pd.DataFrame : + Subset of the original contingency table with the common features. + skbio.TreeNode : + Sub-tree with the common features. + """ + tips = [x.name for x in tree.tips()] + common_tips = list(set(tips) & set(table.columns)) + + if intersect: + _table = table.loc[:, common_tips] + _tree = tree.shear(names=common_tips) + else: + if len(tips) != len(table.columns): + raise ValueError("`table` and `tree` have incompatible sizes, " + "`table` has %d columns, `tree` has %d tips. " + "Consider setting `intersect=True`." 
% + (len(table.columns), len(tips))) + + _table = table + _tree = tree + + _tree.bifurcate() + _tree.prune() + sorted_features = [n.name for n in _tree.tips()] + _table = _table.reindex_axis(sorted_features, axis=1) + return _table, _tree + + +def rename_tips(tree, names=None): + """ Names the tree tips according to level ordering. + + The tree will be traversed from top-down, left to right. + If there `names` is not specified, the node with the smallest label (y0) + will be located at the root of the tree, and the node with the largest + label will be located at bottom right corner of the tree. + + Parameters + ---------- + tree : skbio.TreeNode + Tree object where the leafs correspond to the features. + names : list, optional + List of labels to rename the tip names. It is assumed that the + names are listed in level ordering. + + Returns + ------- + skbio.TreeNode + Tree with renamed internal nodes. + """ + i = 0 + for n in tree.levelorder(): + if not n.is_tip(): + if names is None: + n.name = 'y%i' % i + else: + n.name = names[i] + i+=1 + return tree diff --git a/setup.py b/setup.py index 0f081de..28a305e 100644 --- a/setup.py +++ b/setup.py @@ -35,7 +35,7 @@ def finalize_options(self): extensions = cythonize(extensions) classes = """ - Development Status :: 0 - pre-alpha + Development Status :: 2 - Pre-Alpha License :: OSI Approved :: BSD License Topic :: Software Development :: Libraries Topic :: Scientific/Engineering From a1b9cefd516537c97cb4c2e1ff6ccd1aa219efcb Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Mon, 18 Jul 2016 17:45:16 -0700 Subject: [PATCH 02/12] STY: pep8 --- gneiss/tests/test_util.py | 36 ++++++++++++++++-------------------- gneiss/util.py | 2 +- 2 files changed, 17 insertions(+), 21 deletions(-) diff --git a/gneiss/tests/test_util.py b/gneiss/tests/test_util.py index df6a6ee..542805b 100644 --- a/gneiss/tests/test_util.py +++ b/gneiss/tests/test_util.py @@ -25,7 +25,6 @@ def test_match(self): pdt.assert_frame_equal(exp_table, 
res_table) pdt.assert_frame_equal(exp_metadata, res_metadata) - def test_match_scrambled(self): table = pd.DataFrame([[0, 0, 1, 1], [2, 2, 4, 4], @@ -40,12 +39,12 @@ def test_match_scrambled(self): index=['s1', 's3', 's2', 's4'], columns=['Barcode', 'Treatment']) exp_table = table - exp_metadata = pd.DataFrame([['a', 'control'], - ['b', 'control'], - ['c', 'diseased'], - ['d', 'diseased']], - index=['s1', 's2', 's3', 's4'], - columns=['Barcode', 'Treatment']) + exp_metadata = pd.DataFrame([['a', 'control'], + ['b', 'control'], + ['c', 'diseased'], + ['d', 'diseased']], + index=['s1', 's2', 's3', 's4'], + columns=['Barcode', 'Treatment']) res_table, res_metadata = match(table, metadata) pdt.assert_frame_equal(exp_table, res_table) @@ -70,11 +69,11 @@ def test_match_intersect(self): index=['s1', 's2', 's3'], columns=['o1', 'o2', 'o3', 'o4']) - exp_metadata = pd.DataFrame([['a', 'control'], - ['b', 'control'], - ['c', 'diseased']], - index=['s1', 's2', 's3'], - columns=['Barcode', 'Treatment']) + exp_metadata = pd.DataFrame([['a', 'control'], + ['b', 'control'], + ['c', 'diseased']], + index=['s1', 's2', 's3'], + columns=['Barcode', 'Treatment']) res_table, res_metadata = match(table, metadata, intersect=True) pdt.assert_frame_equal(exp_table, res_table) @@ -99,11 +98,11 @@ def test_match_mismatch(self): index=['s1', 's2', 's3'], columns=['o1', 'o2', 'o3', 'o4']) - exp_metadata = pd.DataFrame([['a', 'control'], - ['b', 'control'], - ['c', 'diseased']], - index=['s1', 's2', 's3'], - columns=['Barcode', 'Treatment']) + exp_metadata = pd.DataFrame([['a', 'control'], + ['b', 'control'], + ['c', 'diseased']], + index=['s1', 's2', 's3'], + columns=['Barcode', 'Treatment']) with self.assertRaises(ValueError): match(table, metadata) @@ -160,7 +159,6 @@ def test_match_tips_scrambled_columns(self): pdt.assert_frame_equal(exp_table, res_table) self.assertEqual(str(exp_tree), str(res_tree)) - def test_match_tips_intersect_tips(self): # there are less tree tips than table columns 
table = pd.DataFrame([[0, 0, 1, 1], @@ -223,14 +221,12 @@ def test_match_tips_mismatch(self): with self.assertRaises(ValueError): match_tips(table, tree) - def test_rename_tips(self): tree = TreeNode.read([u"(((a,b), c),d)r;"]) exp_tree = TreeNode.read([u"(((a,b)y2, c)y1,d)y0;"]) res_tree = rename_tips(tree) self.assertEqual(str(exp_tree), str(res_tree)) - def test_rename_tips_names(self): tree = TreeNode.read([u"(((a,b), c),d)r;"]) exp_tree = TreeNode.read([u"(((a,b)ab, c)abc,d)r;"]) diff --git a/gneiss/util.py b/gneiss/util.py index 77cb390..c9ee63f 100644 --- a/gneiss/util.py +++ b/gneiss/util.py @@ -117,5 +117,5 @@ def rename_tips(tree, names=None): n.name = 'y%i' % i else: n.name = names[i] - i+=1 + i += 1 return tree From 2908c93714b75171ebbf9a9ee2f30f36bbd052d2 Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Mon, 18 Jul 2016 17:48:14 -0700 Subject: [PATCH 03/12] STY: Flake8 to the rescue --- gneiss/tests/test_util.py | 12 ------------ gneiss/util.py | 4 ---- 2 files changed, 16 deletions(-) diff --git a/gneiss/tests/test_util.py b/gneiss/tests/test_util.py index 542805b..4db5eba 100644 --- a/gneiss/tests/test_util.py +++ b/gneiss/tests/test_util.py @@ -91,18 +91,6 @@ def test_match_mismatch(self): ['b', 'control']], index=['s1', 's3', 's2'], columns=['Barcode', 'Treatment']) - - exp_table = pd.DataFrame([[0, 0, 1, 1], - [2, 2, 4, 4], - [5, 5, 3, 3]], - index=['s1', 's2', 's3'], - columns=['o1', 'o2', 'o3', 'o4']) - - exp_metadata = pd.DataFrame([['a', 'control'], - ['b', 'control'], - ['c', 'diseased']], - index=['s1', 's2', 's3'], - columns=['Barcode', 'Treatment']) with self.assertRaises(ValueError): match(table, metadata) diff --git a/gneiss/util.py b/gneiss/util.py index c9ee63f..bda70d5 100644 --- a/gneiss/util.py +++ b/gneiss/util.py @@ -1,7 +1,3 @@ -import pandas as pd -import numpy as np - - def match(x, y, intersect=False): """ Sorts samples in metadata and contingency table in the same order. 
From e2e2e635ed43420020f7af799a85af7d111a2418 Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Tue, 19 Jul 2016 10:13:23 -0700 Subject: [PATCH 04/12] STY: Addressing @antgonza and @josenavas comments Adding tests for duplicate ids, updating documentation. Adding headers for copyright --- gneiss/balances.py | 8 ++++ gneiss/layouts.py | 8 ++++ gneiss/tests/test_balances.py | 8 ++++ gneiss/tests/test_util.py | 46 +++++++++++++++++++ gneiss/util.py | 86 +++++++++++++++++++++++++---------- 5 files changed, 132 insertions(+), 24 deletions(-) diff --git a/gneiss/balances.py b/gneiss/balances.py index 516575d..3f44749 100644 --- a/gneiss/balances.py +++ b/gneiss/balances.py @@ -1,3 +1,11 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2016--, gneiss development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file COPYING.txt, distributed with this software. +# ---------------------------------------------------------------------------- + from __future__ import division import numpy as np import pandas as pd diff --git a/gneiss/layouts.py b/gneiss/layouts.py index 56d88ad..a831cba 100644 --- a/gneiss/layouts.py +++ b/gneiss/layouts.py @@ -1,3 +1,11 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2016--, gneiss development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file COPYING.txt, distributed with this software. 
+# ---------------------------------------------------------------------------- + from ete3 import faces, AttrFace, CircleFace, BarChartFace diff --git a/gneiss/tests/test_balances.py b/gneiss/tests/test_balances.py index 59f2435..3af8262 100644 --- a/gneiss/tests/test_balances.py +++ b/gneiss/tests/test_balances.py @@ -1,3 +1,11 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2016--, gneiss development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file COPYING.txt, distributed with this software. +# ---------------------------------------------------------------------------- + from __future__ import absolute_import, division, print_function import unittest import numpy as np diff --git a/gneiss/tests/test_util.py b/gneiss/tests/test_util.py index 4db5eba..597651a 100644 --- a/gneiss/tests/test_util.py +++ b/gneiss/tests/test_util.py @@ -1,3 +1,11 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2016--, gneiss development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file COPYING.txt, distributed with this software. 
+# ---------------------------------------------------------------------------- + import unittest import pandas as pd import pandas.util.testing as pdt @@ -25,6 +33,38 @@ def test_match(self): pdt.assert_frame_equal(exp_table, res_table) pdt.assert_frame_equal(exp_metadata, res_metadata) + def test_match_duplicate(self): + table1 = pd.DataFrame([[0, 0, 1, 1], + [2, 2, 4, 4], + [5, 5, 3, 3], + [0, 0, 0, 1]], + index=['s2', 's2', 's3', 's4'], + columns=['o1', 'o2', 'o3', 'o4']) + metadata1 = pd.DataFrame([['a', 'control'], + ['b', 'control'], + ['c', 'diseased'], + ['d', 'diseased']], + index=['s1', 's2', 's3', 's4'], + columns=['Barcode', 'Treatment']) + + table2 = pd.DataFrame([[0, 0, 1, 1], + [2, 2, 4, 4], + [5, 5, 3, 3], + [0, 0, 0, 1]], + index=['s1', 's2', 's3', 's4'], + columns=['o1', 'o2', 'o3', 'o4']) + metadata2 = pd.DataFrame([['a', 'control'], + ['b', 'control'], + ['c', 'diseased'], + ['d', 'diseased']], + index=['s1', 's1', 's3', 's4'], + columns=['Barcode', 'Treatment']) + + with self.assertRaises(ValueError): + match(table1, metadata1) + with self.assertRaises(ValueError): + match(table2, metadata2) + def test_match_scrambled(self): table = pd.DataFrame([[0, 0, 1, 1], [2, 2, 4, 4], @@ -221,5 +261,11 @@ def test_rename_tips_names(self): res_tree = rename_tips(tree, ['r', 'abc', 'ab']) self.assertEqual(str(exp_tree), str(res_tree)) + def test_rename_tips_names_mismatch(self): + tree = TreeNode.read([u"(((a,b), c),d)r;"]) + exp_tree = TreeNode.read([u"(((a,b)ab, c)abc,d)r;"]) + with self.assertRaises(IndexError): + rename_tips(tree, ['r', 'abc']) + if __name__ == '__main__': unittest.main() diff --git a/gneiss/util.py b/gneiss/util.py index bda70d5..fed5e20 100644 --- a/gneiss/util.py +++ b/gneiss/util.py @@ -1,48 +1,74 @@ -def match(x, y, intersect=False): +# ---------------------------------------------------------------------------- +# Copyright (c) 2016--, gneiss development team. +# +# Distributed under the terms of the Modified BSD License. 
+# +# The full license is in the file COPYING.txt, distributed with this software. +# ---------------------------------------------------------------------------- + +def match(table, metadata, intersect=False): """ Sorts samples in metadata and contingency table in the same order. Parameters ---------- - x : pd.DataFrame + table : pd.DataFrame Contingency table where samples correspond to rows and features correspond to columns. - y: pd.DataFrame + metadata: pd.DataFrame Metadata table where samples correspond to rows and explanatory metadata variables correspond to columns. intersect : bool, optional Specifies if only the intersection of samples in the contingency table and the metadata table will returned. + By default, this is False. Returns ------- - _x : pd.DataFrame - Filtered dataframe - _y : pd.DataFrame - Filtered dataframe + pd.DataFrame : + Filtered contingency table. + pd.DataFrame : + Filtered metadata table + + Raises + ------ + ValueError: + Raised if duplicate sample ids are present in `table`. + ValueError: + Raised if duplicate sample ids are present in `metadata`. + ValueError: + Raised if `table` and `metadata` have incompatible sizes. """ - _x = x.sort_index() - _y = y.sort_index() + subtableids = set(table.index) + submetadataids = set(metadata.index) + if len(subtableids) != len(table.index): + raise ValueError("`table` has duplicate sample ids.") + if len(submetadataids) != len(metadata.index): + raise ValueError("`metadata` has duplicate sample ids.") + if intersect: - idx = set(_x.index) & set(_y.index) + idx = subtableids & submetadataids idx = sorted(idx) - return _x.loc[idx], _y.loc[idx] + return table.loc[idx], metadata.loc[idx] else: - if len(_x.index) != len(_y.index): - raise ValueError("`x` and `y` have incompatible sizes, " - "`x` has %d rows, `y` has %d rows. 
" + subtable = table.sort_index() + submetadata = metadata.sort_index() + + if len(subtable.index) != len(submetadata.index): + raise ValueError("`table` and `metadata` have incompatible sizes, " + "`table` has %d rows, `metadata` has %d rows. " "Consider setting `intersect=True`." % - (len(_x.index), len(_y.index))) - return _x, _y + (len(subtable.index), len(submetadata.index))) + return subtable, submetadata def match_tips(table, tree, intersect=False): - """ Returns the OTU table and tree with matched tips. + """ Returns the contingency table and tree with matched tips. - Sorts the columns of the OTU table to match the tips in - the tree. If the tree is multi-furcating, then the - tree is reduced to a bifurcating tree by randomly inserting - internal nodes. + Sorts the columns of the contingency table to match the tips in + the tree. The ordering of the tips is in post-traversal order. + If the tree is multi-furcating, then the tree is reduced to a + bifurcating tree by randomly inserting internal nodes. Parameters ---------- @@ -54,6 +80,7 @@ def match_tips(table, tree, intersect=False): intersect : bool, optional Specifies if only the intersection of samples in the contingency table and the tree will returned. + By default, this is False. Returns ------- @@ -61,6 +88,16 @@ def match_tips(table, tree, intersect=False): Subset of the original contingency table with the common features. skbio.TreeNode : Sub-tree with the common features. + + Raises + ------ + ValueError: + Raised if `table` and `tree` have incompatible sizes. + + See Also + -------- + skbio.TreeNode.bifurcate + skbio.TreeNode.tips """ tips = [x.name for x in tree.tips()] common_tips = list(set(tips) & set(table.columns)) @@ -88,8 +125,8 @@ def match_tips(table, tree, intersect=False): def rename_tips(tree, names=None): """ Names the tree tips according to level ordering. - The tree will be traversed from top-down, left to right. 
- If there `names` is not specified, the node with the smallest label (y0) + The tree will be traversed in level order (i.e. top-down, left to right). + If `names` is not specified, the node with the smallest label (y0) will be located at the root of the tree, and the node with the largest label will be located at bottom right corner of the tree. @@ -99,7 +136,8 @@ def rename_tips(tree, names=None): Tree object where the leafs correspond to the features. names : list, optional List of labels to rename the tip names. It is assumed that the - names are listed in level ordering. + names are listed in level ordering, and the length of the list + is at least as long as the number of internal nodes. Returns ------- From d6a037b3addde15ae19176ff10f8edd799adedc4 Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Tue, 19 Jul 2016 10:15:27 -0700 Subject: [PATCH 05/12] STY: pep8 flake8 --- gneiss/tests/test_util.py | 17 ++++++++--------- gneiss/util.py | 3 ++- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/gneiss/tests/test_util.py b/gneiss/tests/test_util.py index 597651a..08da7e6 100644 --- a/gneiss/tests/test_util.py +++ b/gneiss/tests/test_util.py @@ -38,27 +38,27 @@ def test_match_duplicate(self): [2, 2, 4, 4], [5, 5, 3, 3], [0, 0, 0, 1]], - index=['s2', 's2', 's3', 's4'], - columns=['o1', 'o2', 'o3', 'o4']) + index=['s2', 's2', 's3', 's4'], + columns=['o1', 'o2', 'o3', 'o4']) metadata1 = pd.DataFrame([['a', 'control'], ['b', 'control'], ['c', 'diseased'], ['d', 'diseased']], - index=['s1', 's2', 's3', 's4'], - columns=['Barcode', 'Treatment']) + index=['s1', 's2', 's3', 's4'], + columns=['Barcode', 'Treatment']) table2 = pd.DataFrame([[0, 0, 1, 1], [2, 2, 4, 4], [5, 5, 3, 3], [0, 0, 0, 1]], - index=['s1', 's2', 's3', 's4'], - columns=['o1', 'o2', 'o3', 'o4']) + index=['s1', 's2', 's3', 's4'], + columns=['o1', 'o2', 'o3', 'o4']) metadata2 = pd.DataFrame([['a', 'control'], ['b', 'control'], ['c', 'diseased'], ['d', 'diseased']], - index=['s1', 's1', 's3', 
's4'], - columns=['Barcode', 'Treatment']) + index=['s1', 's1', 's3', 's4'], + columns=['Barcode', 'Treatment']) with self.assertRaises(ValueError): match(table1, metadata1) @@ -263,7 +263,6 @@ def test_rename_tips_names(self): def test_rename_tips_names_mismatch(self): tree = TreeNode.read([u"(((a,b), c),d)r;"]) - exp_tree = TreeNode.read([u"(((a,b)ab, c)abc,d)r;"]) with self.assertRaises(IndexError): rename_tips(tree, ['r', 'abc']) diff --git a/gneiss/util.py b/gneiss/util.py index fed5e20..85a2106 100644 --- a/gneiss/util.py +++ b/gneiss/util.py @@ -6,6 +6,7 @@ # The full license is in the file COPYING.txt, distributed with this software. # ---------------------------------------------------------------------------- + def match(table, metadata, intersect=False): """ Sorts samples in metadata and contingency table in the same order. @@ -46,7 +47,7 @@ def match(table, metadata, intersect=False): raise ValueError("`metadata` has duplicate sample ids.") if intersect: - idx = subtableids & submetadataids + idx = subtableids & submetadataids idx = sorted(idx) return table.loc[idx], metadata.loc[idx] else: From d19760519392180a31a34c6875458e1765d98d18 Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Tue, 19 Jul 2016 15:43:46 -0700 Subject: [PATCH 06/12] DOC: Changing method name Adding some mutability tests Adding warning about replacing internal node names --- gneiss/tests/test_util.py | 71 ++++++++++++++++++++++++++++++++++----- gneiss/util.py | 31 +++++++++++++---- 2 files changed, 86 insertions(+), 16 deletions(-) diff --git a/gneiss/tests/test_util.py b/gneiss/tests/test_util.py index 08da7e6..1255de1 100644 --- a/gneiss/tests/test_util.py +++ b/gneiss/tests/test_util.py @@ -1,7 +1,7 @@ # ---------------------------------------------------------------------------- # Copyright (c) 2016--, gneiss development team. # -# Distributed under the terms of the Modified BSD License. +# Distributed under the terms of the GPLv3 License. 
# # The full license is in the file COPYING.txt, distributed with this software. # ---------------------------------------------------------------------------- @@ -10,7 +10,7 @@ import pandas as pd import pandas.util.testing as pdt from skbio import TreeNode -from gneiss.util import match, match_tips, rename_tips +from gneiss.util import match, match_tips, rename_internal_nodes class TestUtil(unittest.TestCase): @@ -33,6 +33,35 @@ def test_match(self): pdt.assert_frame_equal(exp_table, res_table) pdt.assert_frame_equal(exp_metadata, res_metadata) + def test_match_immutable(self): + # tests to make sure that the original tables don't change. + table = pd.DataFrame([[0, 0, 1, 1], + [2, 2, 4, 4], + [5, 5, 3, 3], + [0, 0, 0, 1]], + index=['s1', 's2', 's3', 's4'], + columns=['o1', 'o2', 'o3', 'o4']) + metadata = pd.DataFrame([['a', 'control'], + ['c', 'diseased'], + ['b', 'control']], + index=['s1', 's3', 's2'], + columns=['Barcode', 'Treatment']) + + exp_table = pd.DataFrame([[0, 0, 1, 1], + [2, 2, 4, 4], + [5, 5, 3, 3], + [0, 0, 0, 1]], + index=['s1', 's2', 's3', 's4'], + columns=['o1', 'o2', 'o3', 'o4']) + exp_metadata = pd.DataFrame([['a', 'control'], + ['c', 'diseased'], + ['b', 'control']], + index=['s1', 's3', 's2'], + columns=['Barcode', 'Treatment']) + match(table, metadata, intersect=True) + pdt.assert_frame_equal(table, exp_table) + pdt.assert_frame_equal(metadata, exp_metadata) + def test_match_duplicate(self): table1 = pd.DataFrame([[0, 0, 1, 1], [2, 2, 4, 4], @@ -227,6 +256,19 @@ def test_match_tips_intersect_columns(self): pdt.assert_frame_equal(exp_table, res_table) self.assertEqual(str(exp_tree), str(res_tree)) + def test_match_tips_intersect_tree_immutable(self): + # tests to see if tree chnages. 
+ table = pd.DataFrame([[0, 0, 1], + [2, 3, 4], + [5, 5, 3], + [0, 0, 1]], + index=['s1', 's2', 's3', 's4'], + columns=['a', 'b', 'd']) + tree = TreeNode.read([u"(((a,b)f, c),d)r;"]) + match_tips(table, tree, intersect=True) + self.assertEqual(str(tree), u"(((a,b)f,c),d)r;\n") + + def test_match_tips_mismatch(self): # table has less columns than tree tips table = pd.DataFrame([[0, 0, 1], @@ -249,22 +291,33 @@ def test_match_tips_mismatch(self): with self.assertRaises(ValueError): match_tips(table, tree) - def test_rename_tips(self): + def test_rename_internal_nodes(self): tree = TreeNode.read([u"(((a,b), c),d)r;"]) exp_tree = TreeNode.read([u"(((a,b)y2, c)y1,d)y0;"]) - res_tree = rename_tips(tree) + res_tree = rename_internal_nodes(tree) self.assertEqual(str(exp_tree), str(res_tree)) - def test_rename_tips_names(self): + def test_rename_internal_nodes_names(self): tree = TreeNode.read([u"(((a,b), c),d)r;"]) exp_tree = TreeNode.read([u"(((a,b)ab, c)abc,d)r;"]) - res_tree = rename_tips(tree, ['r', 'abc', 'ab']) + res_tree = rename_internal_nodes(tree, ['r', 'abc', 'ab']) self.assertEqual(str(exp_tree), str(res_tree)) - def test_rename_tips_names_mismatch(self): + def test_rename_internal_nodes_names_mismatch(self): tree = TreeNode.read([u"(((a,b), c),d)r;"]) - with self.assertRaises(IndexError): - rename_tips(tree, ['r', 'abc']) + with self.assertRaises(ValueError): + rename_internal_nodes(tree, ['r', 'abc']) + + def test_rename_internal_nodes(self): + tree = TreeNode.read([u"(((a,b)y2, c),d)r;"]) + with self.assertWarns(Warning): + rename_internal_nodes(tree) + + def test_rename_internal_nodes_immutable(self): + tree = TreeNode.read([u"(((a,b)y2, c),d)r;"]) + rename_internal_nodes(tree) + self.assertEqual(str(tree), "(((a,b)y2,c),d)r;\n") + if __name__ == '__main__': unittest.main() diff --git a/gneiss/util.py b/gneiss/util.py index 85a2106..5674aa1 100644 --- a/gneiss/util.py +++ b/gneiss/util.py @@ -1,10 +1,12 @@ # 
---------------------------------------------------------------------------- # Copyright (c) 2016--, gneiss development team. # -# Distributed under the terms of the Modified BSD License. +# Distributed under the terms of the GPLv3 License. # # The full license is in the file COPYING.txt, distributed with this software. # ---------------------------------------------------------------------------- +import warnings +import copy def match(table, metadata, intersect=False): @@ -123,8 +125,8 @@ def match_tips(table, tree, intersect=False): return _table, _tree -def rename_tips(tree, names=None): - """ Names the tree tips according to level ordering. +def rename_internal_nodes(tree, names=None): + """ Names the internal according to level ordering. The tree will be traversed in level order (i.e. top-down, left to right). If `names` is not specified, the node with the smallest label (y0) @@ -144,13 +146,28 @@ def rename_tips(tree, names=None): ------- skbio.TreeNode Tree with renamed internal nodes. + + ValueError: + Raised if `tree` and `name` have incompatible sizes. """ + _tree = tree.copy() + non_tips = [n for n in _tree.levelorder() if not n.is_tip()] + if not names is None and len(non_tips) != len(names): + raise ValueError("`_tree` and `names` have incompatible sizes, " + "`_tree` has %d tips, `names` has %d elements." % + (len(non_tips), len(names))) + i = 0 - for n in tree.levelorder(): + for n in _tree.levelorder(): if not n.is_tip(): if names is None: - n.name = 'y%i' % i + label = 'y%i' % i else: - n.name = names[i] + label = names[i] + if not n.name is None and label == n.name: + warnings.warn("Warning. 
Internal node (%s) has been replaced " + "with (%s)" % (n.name, label)) + + n.name = label i += 1 - return tree + return _tree From 2ded931959028e69ef3f19e9b30ac6b9fc75c524 Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Tue, 19 Jul 2016 15:45:58 -0700 Subject: [PATCH 07/12] STY: clean up pep8/flake8 --- gneiss/tests/test_util.py | 3 +-- gneiss/util.py | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/gneiss/tests/test_util.py b/gneiss/tests/test_util.py index 1255de1..ce815fe 100644 --- a/gneiss/tests/test_util.py +++ b/gneiss/tests/test_util.py @@ -268,7 +268,6 @@ def test_match_tips_intersect_tree_immutable(self): match_tips(table, tree, intersect=True) self.assertEqual(str(tree), u"(((a,b)f,c),d)r;\n") - def test_match_tips_mismatch(self): # table has less columns than tree tips table = pd.DataFrame([[0, 0, 1], @@ -308,7 +307,7 @@ def test_rename_internal_nodes_names_mismatch(self): with self.assertRaises(ValueError): rename_internal_nodes(tree, ['r', 'abc']) - def test_rename_internal_nodes(self): + def test_rename_internal_nodes_warning(self): tree = TreeNode.read([u"(((a,b)y2, c),d)r;"]) with self.assertWarns(Warning): rename_internal_nodes(tree) diff --git a/gneiss/util.py b/gneiss/util.py index 5674aa1..4aabb3c 100644 --- a/gneiss/util.py +++ b/gneiss/util.py @@ -152,7 +152,7 @@ def rename_internal_nodes(tree, names=None): """ _tree = tree.copy() non_tips = [n for n in _tree.levelorder() if not n.is_tip()] - if not names is None and len(non_tips) != len(names): + if names is not None and len(non_tips) != len(names): raise ValueError("`_tree` and `names` have incompatible sizes, " "`_tree` has %d tips, `names` has %d elements." % (len(non_tips), len(names))) @@ -164,7 +164,7 @@ def rename_internal_nodes(tree, names=None): label = 'y%i' % i else: label = names[i] - if not n.name is None and label == n.name: + if n.name is not None and label == n.name: warnings.warn("Warning. 
Internal node (%s) has been replaced " "with (%s)" % (n.name, label)) From 477d1967bb7ee99eb0d4d40d5225e0a62951388a Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Wed, 20 Jul 2016 10:21:08 -0700 Subject: [PATCH 08/12] DOC: Updating changelog --- CHANGELOG.md | 1 + gneiss/util.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 27164b8..08e45ee 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## Version 0.0.2 (changes since 0.0.2 go here) ### Features +* Adding in utility functions for handling feature tables, metadata, and trees. [#12](https://github.com/biocore/gneiss/pull/12) * Adding GPL license. ### Bug fixes diff --git a/gneiss/util.py b/gneiss/util.py index 4aabb3c..47d1aa3 100644 --- a/gneiss/util.py +++ b/gneiss/util.py @@ -6,7 +6,6 @@ # The full license is in the file COPYING.txt, distributed with this software. # ---------------------------------------------------------------------------- import warnings -import copy def match(table, metadata, intersect=False): From 371cc16d6dd6c7b2b8cd4b984b9ad11745f3c83f Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Mon, 25 Jul 2016 16:31:44 -0700 Subject: [PATCH 09/12] ENH: Adding inplace option --- gneiss/tests/test_util.py | 9 +++++++++ gneiss/util.py | 16 +++++++++++++--- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/gneiss/tests/test_util.py b/gneiss/tests/test_util.py index ce815fe..0076d90 100644 --- a/gneiss/tests/test_util.py +++ b/gneiss/tests/test_util.py @@ -145,6 +145,10 @@ def test_match_intersect(self): columns=['Barcode', 'Treatment']) res_table, res_metadata = match(table, metadata, intersect=True) + # sort for comparison, since the match function + # scrambles the names due to hashing. 
+ res_table = res_table.sort_index() + res_metadata = res_metadata.sort_index() pdt.assert_frame_equal(exp_table, res_table) pdt.assert_frame_equal(exp_metadata, res_metadata) @@ -317,6 +321,11 @@ def test_rename_internal_nodes_immutable(self): rename_internal_nodes(tree) self.assertEqual(str(tree), "(((a,b)y2,c),d)r;\n") + def test_rename_internal_nodes_mutable(self): + tree = TreeNode.read([u"(((a,b)y2, c),d)r;"]) + rename_internal_nodes(tree, inplace=True) + self.assertEqual(str(tree), "(((a,b)y2,c)y1,d)y0;\n") + if __name__ == '__main__': unittest.main() diff --git a/gneiss/util.py b/gneiss/util.py index 47d1aa3..a645ee3 100644 --- a/gneiss/util.py +++ b/gneiss/util.py @@ -39,6 +39,11 @@ def match(table, metadata, intersect=False): Raised if duplicate sample ids are present in `metadata`. ValueError: Raised if `table` and `metadata` have incompatible sizes. + + Note + ---- + If `intersect=True` is specified, then the rows for `table` and + `metadata` will be matched, but they will be in a random order. """ subtableids = set(table.index) submetadataids = set(metadata.index) @@ -49,7 +54,6 @@ def match(table, metadata, intersect=False): if intersect: idx = subtableids & submetadataids - idx = sorted(idx) return table.loc[idx], metadata.loc[idx] else: subtable = table.sort_index() @@ -124,7 +128,7 @@ def match_tips(table, tree, intersect=False): return _table, _tree -def rename_internal_nodes(tree, names=None): +def rename_internal_nodes(tree, names=None, inplace=False): """ Names the internal according to level ordering. The tree will be traversed in level order (i.e. top-down, left to right). @@ -140,6 +144,8 @@ def rename_internal_nodes(tree, names=None): List of labels to rename the tip names. It is assumed that the names are listed in level ordering, and the length of the list is at least as long as the number of internal nodes. + inplace : bool, optional + Specifies if the operation should be done on the original tree or not. 
Returns ------- @@ -149,7 +155,11 @@ def rename_internal_nodes(tree, names=None): ValueError: Raised if `tree` and `name` have incompatible sizes. """ - _tree = tree.copy() + if inplace: + _tree = tree + else: + _tree = tree.copy() + non_tips = [n for n in _tree.levelorder() if not n.is_tip()] if names is not None and len(non_tips) != len(names): raise ValueError("`_tree` and `names` have incompatible sizes, " From 6d70cf9f754097fc9da49e7400ac7114e831affa Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Mon, 25 Jul 2016 18:02:02 -0700 Subject: [PATCH 10/12] STY: Code dereplication --- gneiss/tests/test_util.py | 66 ++++++++++++++------------------------- gneiss/util.py | 50 +++++++++-------------------- 2 files changed, 39 insertions(+), 77 deletions(-) diff --git a/gneiss/tests/test_util.py b/gneiss/tests/test_util.py index 0076d90..fecf40f 100644 --- a/gneiss/tests/test_util.py +++ b/gneiss/tests/test_util.py @@ -30,6 +30,16 @@ def test_match(self): columns=['Barcode', 'Treatment']) exp_table, exp_metadata = table, metadata res_table, res_metadata = match(table, metadata) + + # make sure that the metadata and table indeces match + pdt.assert_index_equal(res_table.index, res_metadata.index) + + res_table = res_table.sort_index() + exp_table = exp_table.sort_index() + + res_metadata = res_metadata.sort_index() + exp_metadata = exp_metadata.sort_index() + pdt.assert_frame_equal(exp_table, res_table) pdt.assert_frame_equal(exp_metadata, res_metadata) @@ -58,7 +68,7 @@ def test_match_immutable(self): ['b', 'control']], index=['s1', 's3', 's2'], columns=['Barcode', 'Treatment']) - match(table, metadata, intersect=True) + match(table, metadata) pdt.assert_frame_equal(table, exp_table) pdt.assert_frame_equal(metadata, exp_metadata) @@ -116,6 +126,15 @@ def test_match_scrambled(self): columns=['Barcode', 'Treatment']) res_table, res_metadata = match(table, metadata) + # make sure that the metadata and table indeces match + pdt.assert_index_equal(res_table.index, 
res_metadata.index) + + res_table = res_table.sort_index() + exp_table = exp_table.sort_index() + + res_metadata = res_metadata.sort_index() + exp_metadata = exp_metadata.sort_index() + pdt.assert_frame_equal(exp_table, res_table) pdt.assert_frame_equal(exp_metadata, res_metadata) @@ -144,7 +163,7 @@ def test_match_intersect(self): index=['s1', 's2', 's3'], columns=['Barcode', 'Treatment']) - res_table, res_metadata = match(table, metadata, intersect=True) + res_table, res_metadata = match(table, metadata) # sort for comparison, since the match function # scrambles the names due to hashing. res_table = res_table.sort_index() @@ -152,21 +171,6 @@ def test_match_intersect(self): pdt.assert_frame_equal(exp_table, res_table) pdt.assert_frame_equal(exp_metadata, res_metadata) - def test_match_mismatch(self): - table = pd.DataFrame([[0, 0, 1, 1], - [2, 2, 4, 4], - [5, 5, 3, 3], - [0, 0, 0, 1]], - index=['s1', 's2', 's3', 's4'], - columns=['o1', 'o2', 'o3', 'o4']) - metadata = pd.DataFrame([['a', 'control'], - ['c', 'diseased'], - ['b', 'control']], - index=['s1', 's3', 's2'], - columns=['Barcode', 'Treatment']) - with self.assertRaises(ValueError): - match(table, metadata) - def test_match_tips(self): table = pd.DataFrame([[0, 0, 1, 1], [2, 2, 4, 4], @@ -236,7 +240,7 @@ def test_match_tips_intersect_tips(self): index=['s1', 's2', 's3', 's4'], columns=['a', 'b', 'd']) exp_tree = tree - res_table, res_tree = match_tips(table, tree, intersect=True) + res_table, res_tree = match_tips(table, tree) pdt.assert_frame_equal(exp_table, res_table) self.assertEqual(str(exp_tree), str(res_tree)) @@ -256,7 +260,7 @@ def test_match_tips_intersect_columns(self): index=['s1', 's2', 's3', 's4'], columns=['d', 'a', 'b']) exp_tree = TreeNode.read([u"(d,(a,b)f)r;"]) - res_table, res_tree = match_tips(table, tree, intersect=True) + res_table, res_tree = match_tips(table, tree) pdt.assert_frame_equal(exp_table, res_table) self.assertEqual(str(exp_tree), str(res_tree)) @@ -269,31 +273,9 @@ def 
test_match_tips_intersect_tree_immutable(self): index=['s1', 's2', 's3', 's4'], columns=['a', 'b', 'd']) tree = TreeNode.read([u"(((a,b)f, c),d)r;"]) - match_tips(table, tree, intersect=True) + match_tips(table, tree) self.assertEqual(str(tree), u"(((a,b)f,c),d)r;\n") - def test_match_tips_mismatch(self): - # table has less columns than tree tips - table = pd.DataFrame([[0, 0, 1], - [2, 3, 4], - [5, 5, 3], - [0, 0, 1]], - index=['s1', 's2', 's3', 's4'], - columns=['a', 'b', 'd']) - tree = TreeNode.read([u"(((a,b)f, c),d)r;"]) - with self.assertRaises(ValueError): - match_tips(table, tree) - - table = pd.DataFrame([[0, 0, 1, 1], - [2, 3, 4, 4], - [5, 5, 3, 3], - [0, 0, 0, 1]], - index=['s1', 's2', 's3', 's4'], - columns=['a', 'b', 'c', 'd']) - tree = TreeNode.read([u"((a,b)f,d)r;"]) - with self.assertRaises(ValueError): - match_tips(table, tree) - def test_rename_internal_nodes(self): tree = TreeNode.read([u"(((a,b), c),d)r;"]) exp_tree = TreeNode.read([u"(((a,b)y2, c)y1,d)y0;"]) diff --git a/gneiss/util.py b/gneiss/util.py index a645ee3..6958ab7 100644 --- a/gneiss/util.py +++ b/gneiss/util.py @@ -8,9 +8,12 @@ import warnings -def match(table, metadata, intersect=False): +def match(table, metadata): """ Sorts samples in metadata and contingency table in the same order. + The intersection of samples in the contingency table and the metadata table + will returned. + Parameters ---------- table : pd.DataFrame @@ -19,10 +22,6 @@ def match(table, metadata, intersect=False): metadata: pd.DataFrame Metadata table where samples correspond to rows and explanatory metadata variables correspond to columns. - intersect : bool, optional - Specifies if only the intersection of samples in the - contingency table and the metadata table will returned. - By default, this is False. 
Returns ------- @@ -52,22 +51,13 @@ def match(table, metadata, intersect=False): if len(submetadataids) != len(metadata.index): raise ValueError("`metadata` has duplicate sample ids.") - if intersect: - idx = subtableids & submetadataids - return table.loc[idx], metadata.loc[idx] - else: - subtable = table.sort_index() - submetadata = metadata.sort_index() - - if len(subtable.index) != len(submetadata.index): - raise ValueError("`table` and `metadata` have incompatible sizes, " - "`table` has %d rows, `metadata` has %d rows. " - "Consider setting `intersect=True`." % - (len(subtable.index), len(submetadata.index))) - return subtable, submetadata + idx = subtableids & submetadataids + subtable = table.loc[idx] + submetadata = metadata.loc[idx] + return subtable, submetadata -def match_tips(table, tree, intersect=False): +def match_tips(table, tree): """ Returns the contingency table and tree with matched tips. Sorts the columns of the contingency table to match the tips in @@ -76,6 +66,9 @@ def match_tips(table, tree, intersect=False): If the tree is multi-furcating, then the tree is reduced to a bifurcating tree by randomly inserting internal nodes. + The intersection of samples in the contingency table and the + tree will returned. + Parameters ---------- table : pd.DataFrame @@ -83,10 +76,6 @@ def match_tips(table, tree, intersect=False): features correspond to columns. tree : skbio.TreeNode Tree object where the leafs correspond to the features. - intersect : bool, optional - Specifies if only the intersection of samples in the - contingency table and the tree will returned. - By default, this is False. 
Returns ------- @@ -108,23 +97,14 @@ def match_tips(table, tree, intersect=False): tips = [x.name for x in tree.tips()] common_tips = list(set(tips) & set(table.columns)) - if intersect: - _table = table.loc[:, common_tips] - _tree = tree.shear(names=common_tips) - else: - if len(tips) != len(table.columns): - raise ValueError("`table` and `tree` have incompatible sizes, " - "`table` has %d columns, `tree` has %d tips. " - "Consider setting `intersect=True`." % - (len(table.columns), len(tips))) - - _table = table - _tree = tree + _table = table.loc[:, common_tips] + _tree = tree.shear(names=common_tips) _tree.bifurcate() _tree.prune() sorted_features = [n.name for n in _tree.tips()] _table = _table.reindex_axis(sorted_features, axis=1) + return _table, _tree From 9003db813f82b0106ac847c4e2acb12542cdefd8 Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Mon, 25 Jul 2016 18:09:14 -0700 Subject: [PATCH 11/12] DOC: cleaning up docstrings --- gneiss/util.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/gneiss/util.py b/gneiss/util.py index 6958ab7..6df83cf 100644 --- a/gneiss/util.py +++ b/gneiss/util.py @@ -9,10 +9,12 @@ def match(table, metadata): - """ Sorts samples in metadata and contingency table in the same order. + """ Matches samples between a contingency table and a metadata table. - The intersection of samples in the contingency table and the metadata table - will returned. + Sorts samples in metadata and contingency table in the same order. + If there are sames contained in the contigency table, but not in metadata or + vice versa, the intersection of samples in the contingency table and the + metadata table will returned. Parameters ---------- @@ -39,10 +41,6 @@ def match(table, metadata): ValueError: Raised if `table` and `metadata` have incompatible sizes. - Note - ---- - If `intersect=True` is specified, then the rows for `table` and - `metadata` will be matched, but they will be in a random order. 
""" subtableids = set(table.index) submetadataids = set(metadata.index) @@ -132,6 +130,8 @@ def rename_internal_nodes(tree, names=None, inplace=False): skbio.TreeNode Tree with renamed internal nodes. + Raises + ------ ValueError: Raised if `tree` and `name` have incompatible sizes. """ From b049ced42527af99e485de6d2098768aaee5779e Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Mon, 25 Jul 2016 18:24:57 -0700 Subject: [PATCH 12/12] pep8 --- gneiss/util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gneiss/util.py b/gneiss/util.py index 6df83cf..98e7031 100644 --- a/gneiss/util.py +++ b/gneiss/util.py @@ -12,8 +12,8 @@ def match(table, metadata): """ Matches samples between a contingency table and a metadata table. Sorts samples in metadata and contingency table in the same order. - If there are sames contained in the contigency table, but not in metadata or - vice versa, the intersection of samples in the contingency table and the + If there are sames contained in the contigency table, but not in metadata + or vice versa, the intersection of samples in the contingency table and the metadata table will returned. Parameters