diff --git a/.github/workflows/python-package-conda.yml b/.github/workflows/python-package-conda.yml
index c0cb58d1..bf370e99 100644
--- a/.github/workflows/python-package-conda.yml
+++ b/.github/workflows/python-package-conda.yml
@@ -1,10 +1,10 @@
 name: biom-format CI
 
-on: 
+on:
   push:
     branches: [ master ]
   pull_request:
-    
+
 jobs:
   lint:
     runs-on: ubuntu-latest
@@ -20,7 +20,7 @@ jobs:
       run: |
         pip install -q flake8
         flake8 biom setup.py
-        
+
   docs:
     runs-on: ubuntu-latest
     steps:
@@ -28,25 +28,25 @@ jobs:
     - uses: conda-incubator/setup-miniconda@v2
       with:
         auto-update-conda: true
-        python-version: 3.7
+        python-version: 3.6
     - name: Install dependencies
       shell: bash -l {0}
       run: |
-        conda create --yes -n env_name python=3.7
+        conda create --yes -n env_name python=3.6
         conda activate env_name
-        conda install --name env_name pip click numpy "scipy>=1.3.1" pep8 flake8 coverage future six "pandas>=0.20.0" nose h5py>=2.2.0 cython
+        conda install --name env_name pip click numpy "scipy>=1.3.1" pep8 flake8 coverage future six "pandas>=0.20.0" nose "h5py>=2.2.0" cython scikit-bio
         pip install sphinx==1.2.2 "docutils<0.14"
         pip install -e . --no-deps
-    - name: Build docs 
+    - name: Build docs
       shell: bash -l {0}
       run: |
         conda activate env_name
         make -C doc html
-        
+
   build:
     strategy:
       matrix:
-        python-version: [3.6, 3.7, 3.8]
+        python-version: [3.6, ] # 3.7, 3.8]
         os: [ubuntu-latest, macos-latest]
     runs-on: ${{ matrix.os }}
 
@@ -61,15 +61,15 @@ jobs:
       run: |
         conda create --yes -n env_name python=${{ matrix.python-version }}
         conda activate env_name
-        conda install -y pip click numpy "scipy>=1.3.1" pep8 flake8 coverage future six "pandas>=0.20.0" nose h5py>=2.2.0 cython
+        conda install -y pip click numpy "scipy>=1.3.1" pep8 flake8 coverage future six "pandas>=0.20.0" nose "h5py>=2.2.0" cython scikit-bio
         pip install anndata
-        
+
     - name: Tests
       shell: bash -l {0}
       run: |
         conda activate env_name
         conda install -y pytest
-        which python 
+        which python
         pip install -e . --no-deps
         make test
         biom show-install-info
diff --git a/ChangeLog.md b/ChangeLog.md
index 18a55b7a..46cea788 100644
--- a/ChangeLog.md
+++ b/ChangeLog.md
@@ -17,6 +17,10 @@ Bug fixes:
 
 * During deployment testing for QIIME 2 2020.11, it was observed that certain combinations of hdf5 or h5py dependencies can result in metadata strings parsing as ASCII rather than UTF-8. Parse of BIOM-Format 2.1.0 files now normalize metadata strings as UTF-8, see [PR #853](https://github.com/biocore/biom-format/pull/853).
 
+New Features:
+
+* Added support for aligning dataframes and trees against biom tables with `Table.align_to_dataframe` and `Table.align_tree`, see [PR #859](https://github.com/biocore/biom-format/pull/859).
+
 biom 2.1.9
 ----------
 
diff --git a/biom/table.py b/biom/table.py
index a70846de..49da56b2 100644
--- a/biom/table.py
+++ b/biom/table.py
@@ -948,6 +948,135 @@ def _get_col(self, col_idx):
         self._data = self._data.tocsc()
         return self._data.getcol(col_idx)
 
+    def align_to_dataframe(self, metadata, axis='sample'):
+        """ Aligns dataframe against biom table, only keeping common ids.
+
+        Parameters
+        ----------
+        metadata : pd.DataFrame
+            The metadata to align, indexed by either sample ids or
+            observation ids (whichever matches `axis`).
+        axis : {'sample', 'observation'}
+            The axis on which to operate.
+
+        Returns
+        -------
+        biom.Table
+            A filtered biom table, with empty samples/observations removed.
+        pd.DataFrame
+            A filtered metadata table, ordered like the filtered table.
+
+        Raises
+        ------
+        TableException
+            If the table and the dataframe share no ids.
+
+        Examples
+        --------
+        >>> from biom import Table
+        >>> import numpy as np
+        >>> import pandas as pd
+        >>> table = Table(np.array([[0, 0, 1, 1],
+        ...                         [2, 2, 4, 4],
+        ...                         [5, 5, 3, 3],
+        ...                         [0, 0, 0, 1]]),
+        ...               ['o1', 'o2', 'o3', 'o4'],
+        ...               ['s1', 's2', 's3', 's4'])
+        >>> metadata = pd.DataFrame([['a', 'control'],
+        ...                          ['c', 'diseased'],
+        ...                          ['b', 'control']],
+        ...                         index=['s1', 's3', 's2'],
+        ...                         columns=['Barcode', 'Treatment'])
+        >>> res_table, res_metadata = table.align_to_dataframe(metadata)
+        >>> print(res_table)  # doctest: +NORMALIZE_WHITESPACE
+        # Constructed from biom file
+        #OTU ID s1  s2  s3
+        o1  0.0 0.0 1.0
+        o2  2.0 2.0 4.0
+        o3  5.0 5.0 3.0
+        >>> print(res_metadata)  # doctest: +NORMALIZE_WHITESPACE
+           Barcode Treatment
+        s1       a   control
+        s2       b   control
+        s3       c  diseased
+        """
+        ids = set(self.ids(axis=axis)) & set(metadata.index)
+        if not ids:
+            raise TableException("No common ids between table and dataframe.")
+
+        t = self.filter(ids, axis=axis, inplace=False)
+        t.remove_empty()
+        md = metadata.loc[t.ids(axis=axis)]
+        return t, md
+
+    def align_tree(self, tree, axis='observation'):
+        r""" Aligns biom table against tree, only keeping common ids.
+
+        Parameters
+        ----------
+        tree : skbio.TreeNode
+            The tree to align; its tip names are matched against either
+            sample ids or observation ids (whichever matches `axis`).
+        axis : {'sample', 'observation'}
+            The axis on which to operate.
+
+        Returns
+        -------
+        biom.Table
+            A filtered biom table, sorted to match the order of the tree tips.
+        skbio.TreeNode
+            A filtered skbio TreeNode object.
+
+        Raises
+        ------
+        TableException
+            If the table and the tree share no ids.
+
+        Examples
+        --------
+        >>> from biom import Table
+        >>> import numpy as np
+        >>> from skbio import TreeNode
+        >>> table = Table(np.array([[0, 0, 1, 1],
+        ...                         [2, 2, 4, 4],
+        ...                         [5, 5, 3, 3],
+        ...                         [0, 0, 0, 1]]),
+        ...               ['o1', 'o2', 'o3', 'o4'],
+        ...               ['s1', 's2', 's3', 's4'])
+        >>> tree = TreeNode.read([u"((o1,o2)f,o3)r;"])
+        >>> res_table, res_tree = table.align_tree(tree)
+        >>> print(res_table)  # doctest: +NORMALIZE_WHITESPACE
+        # Constructed from biom file
+        #OTU ID s1  s2  s3  s4
+        o1  0.0 0.0 1.0 1.0
+        o2  2.0 2.0 4.0 4.0
+        o3  5.0 5.0 3.0 3.0
+        >>> print(res_tree.ascii_art())  # doctest: +NORMALIZE_WHITESPACE
+                            /-o1
+                  /f-------|
+        -r-------|          \-o2
+                 |
+                  \-o3
+        """
+        tips = {x.name for x in tree.tips()}
+        common_tips = tips & set(self.ids(axis=axis))
+        if not common_tips:
+            raise TableException("No common ids between table and tree.")
+        _table = self.filter(common_tips, axis=axis, inplace=False)
+        _table.remove_empty()
+        # remove_empty() can also drop ids along `axis`, so shear the tree
+        # against the ids that survived rather than the raw intersection;
+        # otherwise sort_order() below is handed tips the table lacks.
+        kept = set(_table.ids(axis=axis))
+        if not kept:
+            raise TableException("No non-empty common ids between table "
+                                 "and tree.")
+        _tree = tree.shear(names=kept)
+        _tree.prune()
+        order = [n.name for n in _tree.tips()]
+        _table = _table.sort_order(order, axis=axis)
+        return _table, _tree
+
     def reduce(self, f, axis):
         """Reduce over axis using function `f`
 
diff --git a/biom/tests/test_table.py b/biom/tests/test_table.py
index dd0f4159..30071772 100644
--- a/biom/tests/test_table.py
+++ b/biom/tests/test_table.py
@@ -45,6 +45,12 @@ except ImportError:
     HAVE_ANNDATA = False
 
 
+try:
+    import skbio
+    HAVE_SKBIO = True
+except ImportError:
+    HAVE_SKBIO = False
+
 __author__ = "Daniel McDonald"
 __copyright__ = "Copyright 2011-2017, The BIOM Format Development Team"
 __credits__ = ["Daniel McDonald", "Jai Ram Rideout", "Justin Kuczynski",
@@ -1815,6 +1821,198 @@ def test_add_sample_metadata_two_entries(self):
         self.assertEqual(t._sample_metadata[2]['D'], ['A', 'C'])
         self.assertEqual(t._sample_metadata[3]['D'], ['A', 'D'])
 
+    def test_align_to_dataframe_samples(self):
+        table = Table(np.array([[0, 0, 1, 1],
+                                [2, 2, 4, 4],
+                                [5, 5, 3, 3],
+                                [0, 0, 0, 1]]).T,
+                      ['o1', 'o2', 'o3', 'o4'],
+                      ['s1', 's2', 's3', 's4'])
+        metadata = pd.DataFrame([['a', 'control'],
+                                 ['c', 'diseased'],
+                                 ['b', 'control']],
+                                index=['s1', 's3', 's2'],
+                                columns=['Barcode', 'Treatment'])
+        exp_table = Table(np.array([[0, 0, 1, 1],
+                                    [2, 2, 4, 4],
+                                    [5, 5, 3, 3]]).T,
+                          ['o1', 'o2', 'o3', 'o4'],
+                          ['s1', 's2', 's3'])
+        exp_metadata = pd.DataFrame([['a', 'control'],
+                                     ['b', 'control'],
+                                     ['c', 'diseased']],
+                                    index=['s1', 's2', 's3'],
+                                    columns=['Barcode', 'Treatment'])
+        res_table, res_metadata = table.align_to_dataframe(metadata)
+        pdt.assert_frame_equal(exp_metadata, res_metadata)
+        self.assertEqual(res_table.descriptive_equality(exp_table),
+                         'Tables appear equal')
+
+    def test_align_to_dataframe_observations(self):
+        table = Table(np.array([[0, 0, 1, 1],
+                                [2, 2, 4, 4],
+                                [5, 5, 3, 3],
+                                [0, 0, 0, 1]]),
+                      ['o1', 'o2', 'o3', 'o4'],
+                      ['s1', 's2', 's3', 's4'])
+        metadata = pd.DataFrame([['a', 'Firmicutes'],
+                                 ['c', 'Proteobacteria'],
+                                 ['b', 'Firmicutes']],
+                                index=['o1', 'o3', 'o2'],
+                                columns=['Barcode', 'Treatment'])
+        exp_table = Table(np.array([[0, 0, 1, 1],
+                                    [2, 2, 4, 4],
+                                    [5, 5, 3, 3]]),
+                          ['o1', 'o2', 'o3'],
+                          ['s1', 's2', 's3', 's4'])
+        exp_metadata = pd.DataFrame([['a', 'Firmicutes'],
+                                     ['b', 'Firmicutes'],
+                                     ['c', 'Proteobacteria']],
+                                    index=['o1', 'o2', 'o3'],
+                                    columns=['Barcode', 'Treatment'])
+        res_table, res_metadata = table.align_to_dataframe(
+            metadata, axis='observation')
+        pdt.assert_frame_equal(exp_metadata, res_metadata)
+        self.assertEqual(res_table.descriptive_equality(exp_table),
+                         'Tables appear equal')
+
+    def test_align_to_dataframe_samples_remove_empty(self):
+        table = Table(np.array([[0, 0, 1, 0],
+                                [2, 2, 4, 0],
+                                [5, 5, 3, 0],
+                                [0, 0, 0, 1]]).T,
+                      ['o1', 'o2', 'o3', 'o4'],
+                      ['s1', 's2', 's3', 's4'])
+        metadata = pd.DataFrame([['a', 'control'],
+                                 ['c', 'diseased'],
+                                 ['b', 'control']],
+                                index=['s1', 's3', 's2'],
+                                columns=['Barcode', 'Treatment'])
+        exp_table = Table(np.array([[0, 0, 1],
+                                    [2, 2, 4],
+                                    [5, 5, 3]]).T,
+                          ['o1', 'o2', 'o3'],
+                          ['s1', 's2', 's3'])
+        exp_metadata = pd.DataFrame([['a', 'control'],
+                                     ['b', 'control'],
+                                     ['c', 'diseased']],
+                                    index=['s1', 's2', 's3'],
+                                    columns=['Barcode', 'Treatment'])
+        res_table, res_metadata = table.align_to_dataframe(metadata)
+        pdt.assert_frame_equal(exp_metadata, res_metadata)
+        self.assertEqual(res_table.descriptive_equality(exp_table),
+                         'Tables appear equal')
+
+    def test_align_to_dataframe_samples_empty(self):
+        # an empty dataframe shares no ids with the table
+        table = Table(np.array([[0, 0, 1, 0],
+                                [2, 2, 4, 0],
+                                [5, 5, 3, 0],
+                                [0, 0, 0, 1]]).T,
+                      ['o1', 'o2', 'o3', 'o4'],
+                      ['s1', 's2', 's3', 's4'])
+        metadata = pd.DataFrame(columns=['Barcode', 'Treatment'])
+        with self.assertRaises(TableException):
+            table.align_to_dataframe(metadata)
+
+    def test_align_to_dataframe_samples_no_common_ids(self):
+        table = Table(np.array([[0, 0, 1, 0],
+                                [2, 2, 4, 0],
+                                [5, 5, 3, 0],
+                                [0, 0, 0, 1]]),
+                      ['s1', 's2', 's3', 's4'],
+                      ['o1', 'o2', 'o3', 'o4'])
+        metadata = pd.DataFrame([['a', 'control'],
+                                 ['c', 'diseased'],
+                                 ['b', 'control']],
+                                index=['s1', 's3', 's2'],
+                                columns=['Barcode', 'Treatment'])
+        with self.assertRaises(TableException):
+            table.align_to_dataframe(metadata)
+
+    @pytest.mark.skipif(not HAVE_SKBIO, reason="skbio not installed")
+    def test_align_tree_intersect_tips(self):
+        # there are fewer tree tips than observations
+        table = Table(np.array([[0, 0, 1, 1],
+                                [2, 3, 4, 4],
+                                [5, 5, 3, 3],
+                                [0, 0, 0, 1]]).T,
+                      ['a', 'b', 'c', 'd'],
+                      ['s1', 's2', 's3', 's4'])
+        tree = skbio.TreeNode.read([u"((a,b)f,d)r;"])
+        exp_table = Table(np.array([[0, 0, 1],
+                                    [2, 3, 4],
+                                    [5, 5, 3],
+                                    [0, 0, 1]]).T,
+                          ['a', 'b', 'd'],
+                          ['s1', 's2', 's3', 's4'])
+        exp_tree = tree
+        res_table, res_tree = table.align_tree(tree)
+        self.assertEqual(res_table.descriptive_equality(exp_table),
+                         'Tables appear equal')
+        self.assertEqual(str(exp_tree), str(res_tree))
+
+    @pytest.mark.skipif(not HAVE_SKBIO, reason="skbio not installed")
+    def test_align_tree_intersect_obs(self):
+        # table has fewer observations than tree tips
+        table = Table(np.array([[0, 0, 1],
+                                [2, 3, 4],
+                                [5, 5, 3],
+                                [0, 0, 1]]).T,
+                      ['a', 'b', 'd'],
+                      ['s1', 's2', 's3', 's4'])
+        tree = skbio.TreeNode.read([u"(((a,b)f, c),d)r;"])
+        exp_table = Table(np.array([[1, 0, 0],
+                                    [4, 2, 3],
+                                    [3, 5, 5],
+                                    [1, 0, 0]]).T,
+                          ['d', 'a', 'b'],
+                          ['s1', 's2', 's3', 's4'])
+        exp_tree = skbio.TreeNode.read([u"(d,(a,b)f)r;"])
+        res_table, res_tree = table.align_tree(tree)
+        self.assertEqual(res_table.descriptive_equality(exp_table),
+                         'Tables appear equal')
+        self.assertEqual(exp_tree.compare_rfd(res_tree), 0)
+
+    @pytest.mark.skipif(not HAVE_SKBIO, reason="skbio not installed")
+    def test_align_tree_sample(self):
+        # table has fewer samples than tree tips
+        table = Table(np.array([[0, 0, 1],
+                                [2, 3, 4],
+                                [5, 5, 3],
+                                [0, 0, 1]]),
+                      ['o1', 'o2', 'o3', 'o4'],
+                      ['s1', 's2', 's4'])
+        tree = skbio.TreeNode.read([u"(((s1,s2)F, s3),s4)R;"])
+        exp_table = Table(np.array([[1, 0, 0],
+                                    [4, 2, 3],
+                                    [3, 5, 5],
+                                    [1, 0, 0]]),
+                          ['o1', 'o2', 'o3', 'o4'],
+                          ['s4', 's1', 's2'])
+        exp_tree = skbio.TreeNode.read([u"(s4,(s1,s2)F)R;"])
+        res_table, res_tree = table.align_tree(tree, axis='sample')
+        self.assertEqual(res_table.descriptive_equality(exp_table),
+                         'Tables appear equal')
+        self.assertEqual(exp_tree.compare_rfd(res_tree), 0)
+
+    @pytest.mark.skipif(not HAVE_SKBIO, reason="skbio not installed")
+    def test_align_tree_drops_empty_ids(self):
+        # 'c' is a tip of the tree but is all zeros in the table, so it
+        # has to disappear from both results
+        table = Table(np.array([[0, 0, 1, 1],
+                                [2, 3, 4, 4],
+                                [0, 0, 0, 0],
+                                [0, 0, 0, 1]]),
+                      ['a', 'b', 'c', 'd'],
+                      ['s1', 's2', 's3', 's4'])
+        tree = skbio.TreeNode.read([u"((a,b)f,(c,d)g)r;"])
+        res_table, res_tree = table.align_tree(tree)
+        self.assertEqual(set(res_table.ids(axis='observation')),
+                         {'a', 'b', 'd'})
+        self.assertEqual({n.name for n in res_tree.tips()},
+                         {'a', 'b', 'd'})
+
     def test_get_value_by_ids(self):
         """Return the value located in the matrix by the ids"""
         t1 = Table(np.array([[5, 6], [7, 8]]), [3, 4], [1, 2])