From a1b8961b82205cbaaf236b0a5f5e2589966d9eca Mon Sep 17 00:00:00 2001 From: Michal Ziemski Date: Fri, 17 May 2024 11:49:46 +0200 Subject: [PATCH] ENH: add action to get feature lengths (#164) --- q2_moshpit/__init__.py | 4 +++- q2_moshpit/_utils.py | 18 +++++++++++++++ q2_moshpit/plugin_setup.py | 22 ++++++++++++++++++- ...24dee6fe-9b84-45bb-8145-de7b092533a1.fasta | 4 ++++ ...ca7012fc-ba65-40c3-84f5-05aa478a7585.fasta | 6 +++++ ...d65a71fa-4279-4588-b937-0747ed5d604d.fasta | 11 ++++++++++ q2_moshpit/tests/test_utils.py | 20 ++++++++++++++++- 7 files changed, 82 insertions(+), 3 deletions(-) create mode 100644 q2_moshpit/tests/data/mags-derep/24dee6fe-9b84-45bb-8145-de7b092533a1.fasta create mode 100644 q2_moshpit/tests/data/mags-derep/ca7012fc-ba65-40c3-84f5-05aa478a7585.fasta create mode 100644 q2_moshpit/tests/data/mags-derep/d65a71fa-4279-4588-b937-0747ed5d604d.fasta diff --git a/q2_moshpit/__init__.py b/q2_moshpit/__init__.py index b2be07a7..4f22498f 100644 --- a/q2_moshpit/__init__.py +++ b/q2_moshpit/__init__.py @@ -19,6 +19,7 @@ helpers as kraken_helpers ) from .metabat2 import metabat2 +from ._utils import get_feature_lengths __version__ = get_versions()['version'] del get_versions @@ -26,5 +27,6 @@ __all__ = [ 'metabat2', 'bracken', 'kraken_class', 'kraken_db', 'kaiju_class', 'kaiju_db', 'dereplicate_mags', 'eggnog', - 'busco', 'prodigal', 'kraken_helpers', 'partition' + 'busco', 'prodigal', 'kraken_helpers', 'partition', + 'get_feature_lengths' ] diff --git a/q2_moshpit/_utils.py b/q2_moshpit/_utils.py index f93efe17..1f6a94a7 100644 --- a/q2_moshpit/_utils.py +++ b/q2_moshpit/_utils.py @@ -9,6 +9,11 @@ import hashlib from typing import List +import pandas as pd +import skbio + +from q2_types.feature_data_mag import MAGSequencesDirFmt + def run_command(cmd, env=None, verbose=True, pipe=False, **kwargs): if verbose: @@ -84,3 +89,16 @@ def _calculate_md5_from_file(file_path: str) -> str: for chunk in iter(lambda: f.read(4096), b""): md5_hash.update(chunk) return md5_hash.hexdigest() + + +def get_feature_lengths(features: MAGSequencesDirFmt) -> pd.DataFrame: + """Calculate lengths of features in a feature data object.""" + ids, lengths = [], [] + for _id, fp in features.feature_dict().items(): + sequences = skbio.io.read(fp, format='fasta', verify=False) + ids.append(_id) + lengths.append(sum(len(seq) for seq in sequences)) + + df = pd.DataFrame({'id': ids, 'length': lengths}) + df.set_index('id', inplace=True) + return df diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py index 662e1f1a..1d0d67db 100644 --- a/q2_moshpit/plugin_setup.py +++ b/q2_moshpit/plugin_setup.py @@ -12,7 +12,7 @@ ) from q2_types.distance_matrix import DistanceMatrix from q2_types.feature_data import ( - FeatureData, Sequence, Taxonomy, ProteinSequence + FeatureData, Sequence, Taxonomy, ProteinSequence, SequenceCharacteristics ) from q2_types.feature_table import FeatureTable, Frequency, PresenceAbsence from q2_types.per_sample_sequences import ( @@ -1194,6 +1194,26 @@ citations=[citations["menzel2016"]], ) +plugin.methods.register_function( + function=q2_moshpit._utils.get_feature_lengths, + inputs={ + "features": FeatureData[MAG], + }, + parameters={}, + outputs=[ + ('lengths', + FeatureData[SequenceCharacteristics % Properties('length')]) + ], + input_descriptions={ + "features": "Features to get lengths for." + }, + parameter_descriptions={}, + output_descriptions={'lengths': 'Feature lengths.', }, + name='Get feature lengths.', + description='This method extract lengths for the provided feature set.', + citations=[] +) + plugin.register_semantic_types(BUSCOResults) plugin.register_semantic_type_to_format( BUSCOResults, diff --git a/q2_moshpit/tests/data/mags-derep/24dee6fe-9b84-45bb-8145-de7b092533a1.fasta b/q2_moshpit/tests/data/mags-derep/24dee6fe-9b84-45bb-8145-de7b092533a1.fasta new file mode 100644 index 00000000..b13d36ca --- /dev/null +++ b/q2_moshpit/tests/data/mags-derep/24dee6fe-9b84-45bb-8145-de7b092533a1.fasta @@ -0,0 +1,4 @@ +>NZ_00000000.1_contig1 +ATGTTTCCAGATGCAATGCGTGGGCACTG +>NZ_00000000.1_contig2 +TTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGT diff --git a/q2_moshpit/tests/data/mags-derep/ca7012fc-ba65-40c3-84f5-05aa478a7585.fasta b/q2_moshpit/tests/data/mags-derep/ca7012fc-ba65-40c3-84f5-05aa478a7585.fasta new file mode 100644 index 00000000..5b5f61ef --- /dev/null +++ b/q2_moshpit/tests/data/mags-derep/ca7012fc-ba65-40c3-84f5-05aa478a7585.fasta @@ -0,0 +1,6 @@ +>NZ_CP007255.1_contig1 +GGCGTGCCCTCA +>NZ_CP007255.1_contig2 +GGAACGAACTCGGTCCAGGCGTTCTCCCACAT +>NZ_CP007255.1_contig3 +ATGATCGATCCTTCCCACCAAGCTCA diff --git a/q2_moshpit/tests/data/mags-derep/d65a71fa-4279-4588-b937-0747ed5d604d.fasta b/q2_moshpit/tests/data/mags-derep/d65a71fa-4279-4588-b937-0747ed5d604d.fasta new file mode 100644 index 00000000..ea32c781 --- /dev/null +++ b/q2_moshpit/tests/data/mags-derep/d65a71fa-4279-4588-b937-0747ed5d604d.fasta @@ -0,0 +1,11 @@ +>NZ_CP018863.1_contig1 +GCCTCCTCCCAGTTCGTCTCAGCGCTGCTGCTGGTCGGCGCCAAATTCCGTAACGGGCTG +GCACCTCGAACATTCCGGCCAGAGCGTCCCCAGCCTGGACCACGTTGCCATGACCGTGGC +GGTACTGCGCAGC +>NZ_CP018863.1_contig2 +CCCGGGAGCTTCGTCGCAGTCCGGTGGGCCTCCTCCCAGTTCGTCTCAGCGCTGCTGCTG +GTCGGCGCCAAATTCCGTAACGGGCTGGCACCTCGAACATTCCGGCCAGAGCGTCCCCAG +CCTGGACCACGTTGCCATGACCGTGGCGGTACTGCGCAGC +>NZ_CP018863.1_contig3 +GTCGGCGCCAAATTCCGTAACGGGCTGGCACCTCGAACATTCCGGCCAGAGCGTCCCCAG +GACGATCTGC diff --git a/q2_moshpit/tests/test_utils.py b/q2_moshpit/tests/test_utils.py index 780a10c7..9e775ab6 100644 --- a/q2_moshpit/tests/test_utils.py +++ b/q2_moshpit/tests/test_utils.py @@ -6,10 +6,14 @@ # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- import unittest + +import pandas as pd from qiime2.plugin.testing import TestPluginBase + +from q2_types.feature_data_mag import MAGSequencesDirFmt from .._utils import ( _construct_param, _process_common_input_params, - _calculate_md5_from_file + _calculate_md5_from_file, get_feature_lengths ) @@ -123,6 +127,20 @@ def test_calculate_md5_from_fail(self): observed_hash = _calculate_md5_from_file(path_to_file) self.assertNotEqual(observed_hash, "a583054a9831a6e7cc56ea5cd9cac40a") + def test_get_feature_lengths(self): + mags = MAGSequencesDirFmt(self.get_data_path('mags-derep'), mode='r') + obs = get_feature_lengths(mags) + exp = pd.DataFrame({ + 'id': [ + '24dee6fe-9b84-45bb-8145-de7b092533a1', + 'ca7012fc-ba65-40c3-84f5-05aa478a7585', + 'd65a71fa-4279-4588-b937-0747ed5d604d' + ], + 'length': [66, 70, 363] + }) + exp.set_index('id', inplace=True) + pd.testing.assert_frame_equal(obs, exp) + if __name__ == '__main__': unittest.main()