Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: add action to get feature lengths #164

Merged
merged 2 commits into from
May 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion q2_moshpit/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,14 @@
helpers as kraken_helpers
)
from .metabat2 import metabat2
from ._utils import get_feature_lengths

__version__ = get_versions()['version']
del get_versions

__all__ = [
'metabat2', 'bracken', 'kraken_class', 'kraken_db',
'kaiju_class', 'kaiju_db', 'dereplicate_mags', 'eggnog',
'busco', 'prodigal', 'kraken_helpers', 'partition'
'busco', 'prodigal', 'kraken_helpers', 'partition',
'get_feature_lengths'
]
18 changes: 18 additions & 0 deletions q2_moshpit/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@
import hashlib
from typing import List

import pandas as pd
import skbio

from q2_types.feature_data_mag import MAGSequencesDirFmt


def run_command(cmd, env=None, verbose=True, pipe=False, **kwargs):
if verbose:
Expand Down Expand Up @@ -84,3 +89,16 @@ def _calculate_md5_from_file(file_path: str) -> str:
for chunk in iter(lambda: f.read(4096), b""):
md5_hash.update(chunk)
return md5_hash.hexdigest()


def get_feature_lengths(features: MAGSequencesDirFmt) -> pd.DataFrame:
"""Calculate lengths of features in a feature data object."""
ids, lengths = [], []
for _id, fp in features.feature_dict().items():
sequences = skbio.io.read(fp, format='fasta', verify=False)
ids.append(_id)
lengths.append(sum(len(seq) for seq in sequences))

df = pd.DataFrame({'id': ids, 'length': lengths})
df.set_index('id', inplace=True)
return df
22 changes: 21 additions & 1 deletion q2_moshpit/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
)
from q2_types.distance_matrix import DistanceMatrix
from q2_types.feature_data import (
FeatureData, Sequence, Taxonomy, ProteinSequence
FeatureData, Sequence, Taxonomy, ProteinSequence, SequenceCharacteristics
)
from q2_types.feature_table import FeatureTable, Frequency, PresenceAbsence
from q2_types.per_sample_sequences import (
Expand Down Expand Up @@ -1194,6 +1194,26 @@
citations=[citations["menzel2016"]],
)

plugin.methods.register_function(
function=q2_moshpit._utils.get_feature_lengths,
inputs={
"features": FeatureData[MAG],
},
parameters={},
outputs=[
('lengths',
FeatureData[SequenceCharacteristics % Properties('length')])
],
input_descriptions={
"features": "Features to get lengths for."
},
parameter_descriptions={},
output_descriptions={'lengths': 'Feature lengths.', },
name='Get feature lengths.',
description='This method extract lengths for the provided feature set.',
citations=[]
)

plugin.register_semantic_types(BUSCOResults)
plugin.register_semantic_type_to_format(
BUSCOResults,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
>NZ_00000000.1_contig1
ATGTTTCCAGATGCAATGCGTGGGCACTG
>NZ_00000000.1_contig2
TTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGT
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
>NZ_CP007255.1_contig1
GGCGTGCCCTCA
>NZ_CP007255.1_contig2
GGAACGAACTCGGTCCAGGCGTTCTCCCACAT
>NZ_CP007255.1_contig3
ATGATCGATCCTTCCCACCAAGCTCA
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
>NZ_CP018863.1_contig1
GCCTCCTCCCAGTTCGTCTCAGCGCTGCTGCTGGTCGGCGCCAAATTCCGTAACGGGCTG
GCACCTCGAACATTCCGGCCAGAGCGTCCCCAGCCTGGACCACGTTGCCATGACCGTGGC
GGTACTGCGCAGC
>NZ_CP018863.1_contig2
CCCGGGAGCTTCGTCGCAGTCCGGTGGGCCTCCTCCCAGTTCGTCTCAGCGCTGCTGCTG
GTCGGCGCCAAATTCCGTAACGGGCTGGCACCTCGAACATTCCGGCCAGAGCGTCCCCAG
CCTGGACCACGTTGCCATGACCGTGGCGGTACTGCGCAGC
>NZ_CP018863.1_contig3
GTCGGCGCCAAATTCCGTAACGGGCTGGCACCTCGAACATTCCGGCCAGAGCGTCCCCAG
GACGATCTGC
20 changes: 19 additions & 1 deletion q2_moshpit/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,14 @@
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
import unittest

import pandas as pd
from qiime2.plugin.testing import TestPluginBase

from q2_types.feature_data_mag import MAGSequencesDirFmt
from .._utils import (
_construct_param, _process_common_input_params,
_calculate_md5_from_file
_calculate_md5_from_file, get_feature_lengths
)


Expand Down Expand Up @@ -123,6 +127,20 @@ def test_calculate_md5_from_fail(self):
observed_hash = _calculate_md5_from_file(path_to_file)
self.assertNotEqual(observed_hash, "a583054a9831a6e7cc56ea5cd9cac40a")

def test_get_feature_lengths(self):
mags = MAGSequencesDirFmt(self.get_data_path('mags-derep'), mode='r')
obs = get_feature_lengths(mags)
exp = pd.DataFrame({
'id': [
'24dee6fe-9b84-45bb-8145-de7b092533a1',
'ca7012fc-ba65-40c3-84f5-05aa478a7585',
'd65a71fa-4279-4588-b937-0747ed5d604d'
],
'length': [66, 70, 363]
})
exp.set_index('id', inplace=True)
pd.testing.assert_frame_equal(obs, exp)


if __name__ == '__main__':
unittest.main()
Loading