Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Collapse multifurcation #5

Merged
merged 6 commits into from
Dec 22, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,17 @@
# Greengenes2 changelog

## Version 2022.4-dev
## Version 2022.12

* Added a method for collapsing by multifurcation in [#5](https://github.com/biocore/q2-greengenes2/pull/5)

## Version 2022.10

* Support for filtering a feature table against Greengenes2 by @wasade
* New plugin to compute effect sizes by @giorgianicolaou in [#2](https://github.com/biocore/q2-greengenes2/pull/2)
* Bulk characterization methods by @wasade in [#3](https://github.com/biocore/q2-greengenes2/pull/3)
* Support for non V4 16S data by @wasade in [#4](https://github.com/biocore/q2-greengenes2/pull/4)

## Version 2022.4

### Features

Expand Down
2 changes: 2 additions & 0 deletions q2_gg2/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
filter_features, relabel, clade_v4_asv_assessment,
bulk_clade_v4_asv_assessment,
sequence_v4_asv_assessment,
collapse_multifurcation,
bulk_sequence_v4_asv_assessment, clade_lookup,
compute_effect_size, non_v4_16s)
from . import _version
Expand All @@ -18,4 +19,5 @@
'bulk_clade_v4_asv_assessment', 'clade_lookup',
'sequence_v4_asv_assessment',
'bulk_sequence_v4_asv_assessment',
'collapse_multifurcation',
'compute_effect_size', 'non_v4_16s']
68 changes: 68 additions & 0 deletions q2_gg2/_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -895,3 +895,71 @@ def non_v4_16s(ctx, table, sequences, backbone, perc_identity=0.99, threads=1):
perc_identity=perc_identity,
threads=threads)
return res_table, res_seqs


def collapse_multifurcation(feature_table: biom.Table,
                            phylogeny: NewickFormat) -> (biom.Table,
                                                         skbio.TreeNode):
    """Collapse sibling ASV features into their shared parent node.

    The phylogeny is reduced to the tips that are also observation IDs of
    the table. Internal nodes whose children are all ASV-like tips then have
    those children cut off, and the corresponding table rows are merged
    (``norm=False``) under the name of that node.

    Parameters
    ----------
    feature_table : biom.Table
        The table to collapse. The identifier style is inferred from the
        first observation ID only.
    phylogeny : NewickFormat
        The phylogeny. Either an object exposing ``read()`` or an object
        whose ``str()`` is a path to a Newick file.

    Returns
    -------
    biom.Table
        The collapsed table, with all metadata removed.
    skbio.TreeNode
        The reduced phylogeny with the collapsed tips removed.

    Raises
    ------
    ValueError
        If the table has no observations or its first observation ID does
        not match any known ASV identifier pattern.
    ValueError
        If no observation ID of the table is a tip of the phylogeny.
    """
    # candidate identifier styles: a leading run of 90 nucleotides, 32
    # lowercase hex characters, or exactly 8 digits
    asv_patterns = [re.compile(r'^[ATGC]{90}'),
                    re.compile(r'^[abcdef0-9]{32}$'),
                    re.compile(r'^[0-9]{8}$')]

    # determine which regex to use with the table; an empty table has no
    # first ID to inspect, so it is reported the same way as an unknown style
    observation_ids = feature_table.ids(axis='observation')
    regex = None
    if len(observation_ids) > 0:
        first_id = observation_ids[0]
        regex = next((p for p in asv_patterns if p.match(first_id)), None)

    if regex is None:
        raise ValueError("Could not determine ASV identifiers in the table.")

    # load the newick string, closing the file if we had to open one
    try:
        newick = phylogeny.read()
    except AttributeError:
        with open(str(phylogeny)) as fp:
            newick = fp.read()
    phylogeny = bp.parse_newick(newick)

    # a tip is an open parenthesis immediately followed by a close
    phylogeny_tips = {phylogeny.name(i) for i in range(len(phylogeny.B) - 1)
                      if phylogeny.B[i] and not phylogeny.B[i+1]}
    overlap = phylogeny_tips & set(observation_ids)

    # bail early if something is weird
    if not overlap:
        raise ValueError("No table features found in the phylogeny")

    # reduce the phylogeny to what's in the table
    phylogeny = phylogeny.shear(overlap).collapse()
    phylogeny = bp.to_skbio_treenode(phylogeny)

    # mark what appears to be an ASV in the phylogeny; internal nodes never
    # count as ASVs
    for n in phylogeny.non_tips(include_self=True):
        n.is_asv = False

    for n in phylogeny.tips():
        n.is_asv = bool(regex.match(n.name))
        # only parents of tips can be collapse points
        n.parent.possible_multifurcation = True

    # cut nodes at the multifurcation points
    phylogeny.assign_ids()
    collapse_map = {}
    # materialize the traversal first as the tree is modified in the loop
    for n in list(phylogeny.non_tips()):
        if not hasattr(n, 'possible_multifurcation'):
            continue

        if not all(c.is_asv for c in n.children):
            continue

        # unnamed nodes need a stable label to collapse the table onto
        if n.name is None:
            n.name = 'multifurcation-%d' % n.id

        for c in list(n.children):
            n.remove(c)
            c.parent = None
            collapse_map[c.name] = n.name

    # collapse the feature table to the multifurcation; features which are
    # not part of a collapsed node keep their own ID
    table = feature_table.collapse(lambda i, m: collapse_map.get(i, i),
                                   axis='observation', norm=False)
    table.del_metadata()
    return table, phylogeny
23 changes: 23 additions & 0 deletions q2_gg2/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,4 +331,27 @@
citations=[]
)


# Register collapse_multifurcation as a plugin method: it takes a frequency
# table plus a rooted phylogeny and produces the collapsed pair.
plugin.methods.register_function(
    function=q2_gg2.collapse_multifurcation,
    name="Collapse features present within multifurcations",
    description=(
        "Collapse features present within multifurcations. "
        "This is a phylogenetic feature space reduction technique. "
    ),
    inputs={
        "feature_table": FeatureTable[Frequency],
        "phylogeny": Phylogeny[Rooted],
    },
    input_descriptions={
        "feature_table": "The feature table to collapse",
        "phylogeny": "The reference phylogeny",
    },
    parameters={},
    parameter_descriptions={},
    outputs=[
        ("collapsed_table", FeatureTable[Frequency]),
        ("collapsed_phylogeny", Phylogeny[Rooted]),
    ],
    output_descriptions={
        "collapsed_table": "The resulting collapsed feature table",
        "collapsed_phylogeny": (
            "The phylogeny filtered with "
            "multifurcations collapsed"
        ),
    },
    citations=[],
)


importlib.import_module('q2_gg2._transformer')
17 changes: 17 additions & 0 deletions q2_gg2/tests/test_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
_infer_feature_data_labels,
_sequence_v4_asv_assessment,
filter_features,
collapse_multifurcation,
taxonomy_from_features,
taxonomy_from_table,
relabel,
Expand All @@ -45,6 +46,22 @@ def setUp(self):
">Y\nA\n"
">Z\nA\n")

def test_collapse_multifurcation(self):
    """Clades holding only ASV tips collapse in both the table and tree."""
    # six features using the 8 digit identifier style, over four samples
    table = biom.Table(np.arange(24).reshape(6, 4),
                       ['10000000', '20000000', '30000000',
                        '40000000', '50000000', '60000000'],
                       list('abcd'))
    tree = io.StringIO("((((10000000,20000000)x,G1),(30000000,G2)y),((40000000,50000000,60000000)z,(G3,G4)));")  # noqa

    # x is the sum of rows 10000000 and 20000000, z is the sum of rows
    # 40000000, 50000000 and 60000000; 30000000 is left as is. Observation
    # IDs are all given as strings, matching the input table.
    exp_tab = biom.Table(np.array([[4, 6, 8, 10],
                                   [8, 9, 10, 11],
                                   [48, 51, 54, 57]]),
                         ['x', '30000000', 'z'],
                         list('abcd'))
    exp_tree = skbio.TreeNode.read(["(((x,G1),(30000000,G2)y),(z,(G3,G4)));"])  # noqa

    obs_tab, obs_tree = collapse_multifurcation(table, tree)

    self.assertEqual(obs_tab, exp_tab)
    self.assertEqual(obs_tree.compare_rfd(exp_tree), 0.)

def test_infer_feature_data_labels(self):
taxa = pd.DataFrame([['MJ007-1-barcode27-umi40bins-ubs-7010', 's__foo',
0.1],
Expand Down
10 changes: 10 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,15 @@

import versioneer


# Runtime dependencies, passed to setup() through install_requires.
install_requires = [
    "biom-format",
    "iow",
    "redbiom",
    "scikit-bio",
]


setup(
name="q2-greengenes2",
version=versioneer.get_version(),
Expand All @@ -19,4 +28,5 @@
'q2_gg2': []
},
zip_safe=False,
install_requires=install_requires,
)