From 69d2a810eb54c014e645f6de8df92d500eb9d554 Mon Sep 17 00:00:00 2001 From: Santiago Castro Dau Date: Mon, 10 Jun 2024 14:41:59 +0200 Subject: [PATCH] renaming stuff --- q2_types/__init__.py | 2 +- q2_types/hmmer/__init__.py | 38 ---- q2_types/hmmer/_type.py | 135 -------------- .../tests/data/bacteria/bacteria.hmm.idmap | 19 -- .../tests/data/invalid_idmaps/1.hmm.idmap | 19 -- .../tests/data/invalid_idmaps/2.hmm.idmap | 19 -- .../tests/data/invalid_idmaps/3.hmm.idmap | 19 -- .../tests/data/invalid_idmaps/4.hmm.idmap | 19 -- q2_types/hmmer/tests/test_format.py | 174 ------------------ q2_types/profile_hmms/__init__.py | 42 +++++ q2_types/{hmmer => profile_hmms}/_format.py | 64 ++----- q2_types/profile_hmms/_type.py | 134 ++++++++++++++ .../{hmmer => profile_hmms}/tests/__init__.py | 0 .../tests/data/bacteria/bacteria.hmm.h3f | 0 .../tests/data/bacteria/bacteria.hmm.h3i | 0 .../tests/data/bacteria/bacteria.hmm.h3m | 0 .../tests/data/bacteria/bacteria.hmm.h3p | 0 .../tests/data/hmms/2_dna.hmm | 0 .../tests/data/hmms/2_rna.hmm | 0 .../tests/data/hmms/4_amino.hmm | 0 .../tests/data/hmms/amino.hmm | 0 .../tests/data/hmms/amino_dna.hmm | 0 .../tests/data/hmms/dna.hmm | 0 .../tests/data/hmms/rna.hmm | 0 .../tests/data/hmms/rna_dna.hmm | 0 q2_types/profile_hmms/tests/test_format.py | 103 +++++++++++ .../tests/test_type.py | 35 ++-- q2_types/reference_db/tests/test_format.py | 10 +- 28 files changed, 319 insertions(+), 513 deletions(-) delete mode 100644 q2_types/hmmer/__init__.py delete mode 100644 q2_types/hmmer/_type.py delete mode 100644 q2_types/hmmer/tests/data/bacteria/bacteria.hmm.idmap delete mode 100644 q2_types/hmmer/tests/data/invalid_idmaps/1.hmm.idmap delete mode 100644 q2_types/hmmer/tests/data/invalid_idmaps/2.hmm.idmap delete mode 100644 q2_types/hmmer/tests/data/invalid_idmaps/3.hmm.idmap delete mode 100644 q2_types/hmmer/tests/data/invalid_idmaps/4.hmm.idmap delete mode 100644 q2_types/hmmer/tests/test_format.py create mode 100644 q2_types/profile_hmms/__init__.py rename q2_types/{hmmer => profile_hmms}/_format.py (67%) create mode 100644 q2_types/profile_hmms/_type.py rename q2_types/{hmmer => profile_hmms}/tests/__init__.py (100%) rename q2_types/{hmmer => profile_hmms}/tests/data/bacteria/bacteria.hmm.h3f (100%) rename q2_types/{hmmer => profile_hmms}/tests/data/bacteria/bacteria.hmm.h3i (100%) rename q2_types/{hmmer => profile_hmms}/tests/data/bacteria/bacteria.hmm.h3m (100%) rename q2_types/{hmmer => profile_hmms}/tests/data/bacteria/bacteria.hmm.h3p (100%) rename q2_types/{hmmer => profile_hmms}/tests/data/hmms/2_dna.hmm (100%) rename q2_types/{hmmer => profile_hmms}/tests/data/hmms/2_rna.hmm (100%) rename q2_types/{hmmer => profile_hmms}/tests/data/hmms/4_amino.hmm (100%) rename q2_types/{hmmer => profile_hmms}/tests/data/hmms/amino.hmm (100%) rename q2_types/{hmmer => profile_hmms}/tests/data/hmms/amino_dna.hmm (100%) rename q2_types/{hmmer => profile_hmms}/tests/data/hmms/dna.hmm (100%) rename q2_types/{hmmer => profile_hmms}/tests/data/hmms/rna.hmm (100%) rename q2_types/{hmmer => profile_hmms}/tests/data/hmms/rna_dna.hmm (100%) create mode 100644 q2_types/profile_hmms/tests/test_format.py rename q2_types/{hmmer => profile_hmms}/tests/test_type.py (64%) diff --git a/q2_types/__init__.py b/q2_types/__init__.py index d78d5d56..8cb98d85 100644 --- a/q2_types/__init__.py +++ b/q2_types/__init__.py @@ -31,4 +31,4 @@ importlib.import_module('q2_types.genome_data') importlib.import_module('q2_types.kaiju') importlib.import_module('q2_types.reference_db') -importlib.import_module('q2_types.hmmer') +importlib.import_module('q2_types.profile_hmms') diff --git a/q2_types/hmmer/__init__.py b/q2_types/hmmer/__init__.py deleted file mode 100644 index 5e35863a..00000000 --- a/q2_types/hmmer/__init__.py +++ /dev/null @@ -1,38 +0,0 @@ -# ---------------------------------------------------------------------------- -# Copyright (c) 2023, QIIME 2 development team. -# -# Distributed under the terms of the Modified BSD License. -# -# The full license is in the file LICENSE, distributed with this software. -# ---------------------------------------------------------------------------- -from ._format import ( - AminoHmmMultipleProfilesFileFmt, - DnaHmmMultipleProfilesFileFmt, - RnaHmmMultipleProfilesFileFmt, - AminoHmmMultipleProfilesDirectoryFormat, - DnaHmmMultipleProfilesDirectoryFormat, - RnaHmmMultipleProfilesDirectoryFormat, - AminoHmmFileFmt, DnaHmmFileFmt, RnaHmmFileFmt, - AminoHmmDirectoryFormat, DnaHmmDirectoryFormat, RnaHmmDirectoryFormat, - BaseHmmPressedDirFmt -) -from ._type import ( - HMM, - SingleAmino, SingleDNA, SingleRNA, - MultipleAmino, MultipleDNA, MultipleRNA, - MultipleAminoPressed, MultipleDNAPressed, MultipleRNAPressed -) - -__all__ = [ - "AminoHmmMultipleProfilesFileFmt", "DnaHmmMultipleProfilesFileFmt", - "RnaHmmMultipleProfilesFileFmt", "AminoHmmMultipleProfilesDirectoryFormat", - "DnaHmmMultipleProfilesDirectoryFormat", - "RnaHmmMultipleProfilesDirectoryFormat", - "AminoHmmFileFmt", "DnaHmmFileFmt", "RnaHmmFileFmt", - "AminoHmmDirectoryFormat", "DnaHmmDirectoryFormat", - "RnaHmmDirectoryFormat", "HMM", - "SingleAmino", "SingleDNA", "SingleRNA", - "MultipleAmino", "MultipleDNA", "MultipleRNA", - "MultipleAminoPressed", "MultipleDNAPressed", "MultipleRNAPressed", - "BaseHmmPressedDirFmt" -] diff --git a/q2_types/hmmer/_type.py b/q2_types/hmmer/_type.py deleted file mode 100644 index 1714dacd..00000000 --- a/q2_types/hmmer/_type.py +++ /dev/null @@ -1,135 +0,0 @@ -# ---------------------------------------------------------------------------- -# Copyright (c) 2023, QIIME 2 development team. -# -# Distributed under the terms of the Modified BSD License. -# -# The full license is in the file LICENSE, distributed with this software. -# ---------------------------------------------------------------------------- -from qiime2.plugin import SemanticType -from q2_types.plugin_setup import plugin -from q2_types.hmmer._format import ( - AminoHmmMultipleProfilesDirectoryFormat, - DnaHmmMultipleProfilesDirectoryFormat, - RnaHmmMultipleProfilesDirectoryFormat, - AminoHmmDirectoryFormat, DnaHmmDirectoryFormat, RnaHmmDirectoryFormat, - BaseHmmPressedDirFmt -) - - -HMM = SemanticType('HMM', field_names='type') -SingleAmino = SemanticType( - 'SingleAmino', variant_of=HMM.field['type'] -) -SingleDNA = SemanticType( - 'SingleDNA', variant_of=HMM.field['type'] -) -SingleRNA = SemanticType( - 'SingleRNA', variant_of=HMM.field['type'] -) -MultipleAmino = SemanticType( - 'MultipleAmino', variant_of=HMM.field['type'] -) -MultipleDNA = SemanticType( - 'MultipleDNA', variant_of=HMM.field['type'] -) -MultipleRNA = SemanticType( - 'MultipleRNA', variant_of=HMM.field['type'] -) -MultipleAminoPressed = SemanticType( - 'MultipleAminoPressed', variant_of=HMM.field['type'] -) -MultipleDNAPressed = SemanticType( - 'MultipleDNAPressed', variant_of=HMM.field['type'] -) -MultipleRNAPressed = SemanticType( - 'MultipleRNAPressed', variant_of=HMM.field['type'] -) - -plugin.register_semantic_types( - HMM, - SingleAmino, SingleDNA, SingleRNA, - MultipleAmino, MultipleDNA, MultipleRNA, - MultipleAminoPressed, MultipleDNAPressed, - MultipleRNAPressed -) - -plugin.register_artifact_class( - HMM[MultipleAminoPressed], - directory_format=BaseHmmPressedDirFmt, - description=( - "A collection of Hidden Markov Model profiles for amino acid " - "sequences in binary format and indexed." - ) -) - -plugin.register_artifact_class( - HMM[MultipleDNAPressed], - directory_format=BaseHmmPressedDirFmt, - description=( - "A collection of Hidden Markov Model profiles for DNA " - "sequences in binary format and indexed." - ) -) - -plugin.register_artifact_class( - HMM[MultipleRNAPressed], - directory_format=BaseHmmPressedDirFmt, - description=( - "A collection of Hidden Markov Model profiles for RNA " - "sequences in binary format and indexed." - ) -) - -plugin.register_artifact_class( - HMM[SingleAmino], - directory_format=AminoHmmDirectoryFormat, - description=( - "One single Hidden Markov Model profile, representing a group " - "of related proteins." - ) -) - -plugin.register_artifact_class( - HMM[SingleDNA], - directory_format=DnaHmmDirectoryFormat, - description=( - "One single Hidden Markov Model profile, representing a group " - "of related DNA sequences." - ) -) - -plugin.register_artifact_class( - HMM[SingleRNA], - directory_format=RnaHmmDirectoryFormat, - description=( - "One single Hidden Markov Model profile, representing a group " - "of related RNA sequences." - ) -) - -plugin.register_artifact_class( - HMM[MultipleAmino], - directory_format=AminoHmmMultipleProfilesDirectoryFormat, - description=( - "A collection of Hidden Markov Model profiles, each representing a " - "group of related proteins." - ) -) - -plugin.register_artifact_class( - HMM[MultipleDNA], - directory_format=DnaHmmMultipleProfilesDirectoryFormat, - description=( - "A collection of Hidden Markov Model profiles, each representing a " - "group of related DNA sequences." - ) -) - -plugin.register_artifact_class( - HMM[MultipleRNA], - directory_format=RnaHmmMultipleProfilesDirectoryFormat, - description=( - "A collection of Hidden Markov Model profiles, each representing a " - "group of related RNA sequences." - ) -) diff --git a/q2_types/hmmer/tests/data/bacteria/bacteria.hmm.idmap b/q2_types/hmmer/tests/data/bacteria/bacteria.hmm.idmap deleted file mode 100644 index 4e7e0050..00000000 --- a/q2_types/hmmer/tests/data/bacteria/bacteria.hmm.idmap +++ /dev/null @@ -1,19 +0,0 @@ -1 1FKAT -2 1FIZK -3 1FIY1 -4 1FKA5 -5 1FIYP -6 1FK7D -7 1FIX5 -8 1FKCK -9 1FIXT -10 1FKBX -11 1FIYG -12 1FKAC -13 1FKB9 -14 1FK72 -15 1FK4H -16 1FK7S -17 1FK66 -18 1FK6W -19 1FIXC \ No newline at end of file diff --git a/q2_types/hmmer/tests/data/invalid_idmaps/1.hmm.idmap b/q2_types/hmmer/tests/data/invalid_idmaps/1.hmm.idmap deleted file mode 100644 index d8f7bc5c..00000000 --- a/q2_types/hmmer/tests/data/invalid_idmaps/1.hmm.idmap +++ /dev/null @@ -1,19 +0,0 @@ -1 1FKAT:"%#@ -2 1FIZK -3 1FIY1 -4 1FKA5 -5 1FIYP -6 1FK7D -7 1FIX5 -8 1FKCK -9 1FIXT -10 1FKBX -11 1FIYG -12 1FKAC -13 1FKB9 -14 1FK72 -15 1FK4H -16 1FK7S -17 1FK66 -18 1FK6W -19 1FIXC \ No newline at end of file diff --git a/q2_types/hmmer/tests/data/invalid_idmaps/2.hmm.idmap b/q2_types/hmmer/tests/data/invalid_idmaps/2.hmm.idmap deleted file mode 100644 index ea1cb3ba..00000000 --- a/q2_types/hmmer/tests/data/invalid_idmaps/2.hmm.idmap +++ /dev/null @@ -1,19 +0,0 @@ -1 1FKAT -2 1FIZK -3 1FIY1 -4 1FKA5 -5 1FIYP -6 1FK7D -7 1FIX5 -8 1FKCK -9 1FIXT -10 1FKBX -11 1FIYG -12 1FKAC -13 1FKB9 -14 1FK72 -15 1FK4H -16 1FK7S -17 1FK66 -18 1FK6W -19 1FIXC \ No newline at end of file diff --git a/q2_types/hmmer/tests/data/invalid_idmaps/3.hmm.idmap b/q2_types/hmmer/tests/data/invalid_idmaps/3.hmm.idmap deleted file mode 100644 index 1c28be88..00000000 --- a/q2_types/hmmer/tests/data/invalid_idmaps/3.hmm.idmap +++ /dev/null @@ -1,19 +0,0 @@ -1 1FKAT -2 1FIZK -3 1FIY1 -4 1FKA5 -5 1FIYP -6 1FK7D -7 1FIX5 -8 1FKCK -9 1FIXT -10 1FKBX -11 1FIYG -12 1FKAC -13 1FKB9 -14 1FK72 -15 1FK4H -16 1FK7S -17 1FK66 -18 1FK6W -20 1FIXC \ No newline at end of file diff --git a/q2_types/hmmer/tests/data/invalid_idmaps/4.hmm.idmap b/q2_types/hmmer/tests/data/invalid_idmaps/4.hmm.idmap deleted file mode 100644 index 260b29b8..00000000 --- a/q2_types/hmmer/tests/data/invalid_idmaps/4.hmm.idmap +++ /dev/null @@ -1,19 +0,0 @@ -1FKAT -2 1FIZK -3 1FIY1 -4 1FKA5 -5 1FIYP -6 1FK7D -7 1FIX5 -8 1FKCK -9 1FIXT -10 1FKBX -11 1FIYG -12 1FKAC -13 1FKB9 -14 1FK72 -15 1FK4H -16 1FK7S -17 1FK66 -18 1FK6W -19 1FIXC \ No newline at end of file diff --git a/q2_types/hmmer/tests/test_format.py b/q2_types/hmmer/tests/test_format.py deleted file mode 100644 index ca0c4803..00000000 --- a/q2_types/hmmer/tests/test_format.py +++ /dev/null @@ -1,174 +0,0 @@ -# ---------------------------------------------------------------------------- -# Copyright (c) 2023, QIIME 2 development team. -# -# Distributed under the terms of the Modified BSD License. -# -# The full license is in the file LICENSE, distributed with this software. -# ---------------------------------------------------------------------------- -import tempfile -import shutil -import os -from qiime2.plugin.testing import TestPluginBase -from q2_types.hmmer._format import ( - HmmIdmapFileFmt, BaseHmmPressedDirFmt, AminoHmmFileFmt, DnaHmmFileFmt, - RnaHmmFileFmt, AminoHmmMultipleProfilesFileFmt, - DnaHmmMultipleProfilesFileFmt, RnaHmmMultipleProfilesFileFmt -) -from qiime2.plugin import ValidationError - - -class TestHmmFormats(TestPluginBase): - package = 'q2_types.hmmer.tests' - - def test_HmmIdmapFileFmt_valid(self): - fmt = HmmIdmapFileFmt( - self.get_data_path("bacteria/bacteria.hmm.idmap"), 'r' - ) - fmt.validate() - - def test_HmmIdmapFileFmt_invalid_idmap_1(self): - fmt = HmmIdmapFileFmt( - self.get_data_path("invalid_idmaps/1.hmm.idmap"), 'r' - ) - with self.assertRaisesRegex( - ValidationError, - "Expected index and an alphanumeric code separated " - "by a single space." - ): - fmt.validate(level="min") - - def test_HmmIdmapFileFmt_invalid_idmap_2(self): - fmt = HmmIdmapFileFmt( - self.get_data_path("invalid_idmaps/2.hmm.idmap"), 'r' - ) - with self.assertRaisesRegex( - ValidationError, - "Expected index and an alphanumeric code separated " - "by a single space." - ): - fmt.validate(level="min") - - def test_HmmIdmapFileFmt_invalid_idmap_3(self): - fmt = HmmIdmapFileFmt( - self.get_data_path("invalid_idmaps/3.hmm.idmap"), 'r' - ) - with self.assertRaisesRegex( - ValidationError, - 'Expected index' - ): - fmt.validate(level="min") - - def test_HmmIdmapFileFmt_invalid_idmap_4(self): - fmt = HmmIdmapFileFmt( - self.get_data_path("invalid_idmaps/4.hmm.idmap"), 'r' - ) - with self.assertRaisesRegex( - ValidationError, - "Expected index and an alphanumeric code separated " - "by a single space." - ): - fmt.validate(level="min") - - def test_BaseHmmPressedDirFmt_missing_hmm(self): - with tempfile.TemporaryDirectory() as tmp: - shutil.copytree( - self.get_data_path("bacteria"), tmp, dirs_exist_ok=True - ) - os.remove(f"{tmp}/bacteria.hmm.h3f") - fmt = BaseHmmPressedDirFmt(tmp, 'r') - with self.assertRaisesRegex( - ValidationError, "Missing one or more files" - ): - fmt.validate(level="min") - - def test_BaseHmmPressedDirFmt_missing_idmap_ok(self): - with tempfile.TemporaryDirectory() as tmp: - shutil.copytree( - self.get_data_path("bacteria"), tmp, dirs_exist_ok=True - ) - os.remove(f"{tmp}/bacteria.hmm.idmap") - fmt = BaseHmmPressedDirFmt(tmp, 'r') - fmt.validate(level="min") - - def test_BaseHmmPressedDirFmt_valid(self): - fmt = BaseHmmPressedDirFmt(self.get_data_path("bacteria"), 'r') - fmt.validate(level="min") - - def test_AminoHmmFileFmt_valid(self): - fmt = AminoHmmFileFmt(self.get_data_path("hmms/amino.hmm"), "r") - fmt.validate() - - def test_DnaHmmFileFmt_valid(self): - fmt = DnaHmmFileFmt(self.get_data_path("hmms/dna.hmm"), "r") - fmt.validate() - - def test_RnaHmmFileFmt_valid(self): - fmt = RnaHmmFileFmt(self.get_data_path("hmms/rna.hmm"), "r") - fmt.validate() - - def test_AminoHmmFileFmt_invalid_alph(self): - for type in ["rna", "dna"]: - fmt = AminoHmmFileFmt(self.get_data_path(f"hmms/{type}.hmm"), "r") - with self.assertRaisesRegex( - ValidationError, "Found profile with alphabet " - ): - fmt.validate() - - def test_DnaHmmFileFmt_invalid_alph(self): - for type in ["rna", "amino"]: - fmt = DnaHmmFileFmt(self.get_data_path(f"hmms/{type}.hmm"), "r") - with self.assertRaisesRegex( - ValidationError, "Found profile with alphabet " - ): - fmt.validate() - - def test_RnaHmmFileFmt_invalid_alph(self): - for type in ["dna", "amino"]: - fmt = RnaHmmFileFmt(self.get_data_path(f"hmms/{type}.hmm"), "r") - with self.assertRaisesRegex( - ValidationError, "Found profile with alphabet " - ): - fmt.validate() - - def test_AminoHmmFileFmt_too_many_profiles(self): - fmt = AminoHmmFileFmt(self.get_data_path("hmms/4_amino.hmm"), "r") - with self.assertRaisesRegex( - ValidationError, "Expected 1 profile, found 4." - ): - fmt.validate() - - def test_AminoHmmMultipleProfilesFileFmt_valid(self): - fmt = AminoHmmMultipleProfilesFileFmt( - self.get_data_path("hmms/4_amino.hmm"), 'r' - ) - fmt.validate() - - def test_DnaHmmMultipleProfilesFileFmt_valid(self): - fmt = DnaHmmMultipleProfilesFileFmt( - self.get_data_path("hmms/2_dna.hmm"), "r" - ) - fmt.validate() - - def test_RnaHmmMultipleProfilesFileFmt_valid(self): - fmt = RnaHmmMultipleProfilesFileFmt( - self.get_data_path("hmms/2_rna.hmm"), "r" - ) - fmt.validate() - - def test_mixed_hmm_profiles_invalid_1(self): - fmt = AminoHmmMultipleProfilesFileFmt( - self.get_data_path("hmms/amino_dna.hmm"), 'r' - ) - with self.assertRaisesRegex( - ValidationError, "Found profiles with different alphabets." - ): - fmt.validate() - - def test_mixed_hmm_profiles_invalid_2(self): - fmt = DnaHmmMultipleProfilesFileFmt( - self.get_data_path("hmms/rna_dna.hmm"), 'r' - ) - with self.assertRaisesRegex( - ValidationError, "Found profiles with different alphabets." - ): - fmt.validate() diff --git a/q2_types/profile_hmms/__init__.py b/q2_types/profile_hmms/__init__.py new file mode 100644 index 00000000..cfdd5c6c --- /dev/null +++ b/q2_types/profile_hmms/__init__.py @@ -0,0 +1,42 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2023, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- +from ._format import ( + ProteinHmmMultipleProfilesFileFmt, + DnaHmmMultipleProfilesFileFmt, + RnaHmmMultipleProfilesFileFmt, + ProteinHmmMultipleProfilesDirectoryFormat, + DnaHmmMultipleProfilesDirectoryFormat, + RnaHmmMultipleProfilesDirectoryFormat, + ProteinHmmFileFmt, DnaHmmFileFmt, RnaHmmFileFmt, + ProteinHmmDirectoryFormat, DnaHmmDirectoryFormat, RnaHmmDirectoryFormat, + BaseHmmPressedDirFmt +) +from ._type import ( + ProfileHMM, + SingleProtein, SingleDNA, SingleRNA, + MultipleProtein, MultipleDNA, MultipleRNA, + PressedRNA, PressedDNA, PressedProtein +) + +__all__ = [ + "ProteinHmmMultipleProfilesFileFmt", + "DnaHmmMultipleProfilesFileFmt", + "RnaHmmMultipleProfilesFileFmt", + "ProteinHmmMultipleProfilesDirectoryFormat", + "DnaHmmMultipleProfilesDirectoryFormat", + "RnaHmmMultipleProfilesDirectoryFormat", + "ProteinHmmFileFmt", "DnaHmmFileFmt", "RnaHmmFileFmt", + "ProteinHmmDirectoryFormat", + "DnaHmmDirectoryFormat", + "RnaHmmDirectoryFormat", + "BaseHmmPressedDirFmt", + "ProfileHMM", + "SingleProtein", "SingleDNA", "SingleRNA", + "MultipleProtein", "MultipleDNA", "MultipleRNA", + "PressedRNA", "PressedDNA", "PressedProtein" +] diff --git a/q2_types/hmmer/_format.py b/q2_types/profile_hmms/_format.py similarity index 67% rename from q2_types/hmmer/_format.py rename to q2_types/profile_hmms/_format.py index 466c2495..b03ce069 100644 --- a/q2_types/hmmer/_format.py +++ b/q2_types/profile_hmms/_format.py @@ -5,7 +5,6 @@ # # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- -import re from pyhmmer.plan7 import HMMFile from qiime2.plugin import model from qiime2.core.exceptions import ValidationError @@ -17,36 +16,6 @@ def _validate_(self, level): pass -class HmmIdmapFileFmt(model.TextFileFormat): - def _validate_(self, level): - with open(str(self), 'r') as file: - # Set the number of rows to be parsed - max_lines = {"min": 100, "max": 10000000}[level] - lines = file.readlines() - for i, line in enumerate(lines, 1): - # Check number of lines parsed so far - if i > max_lines: - break - - # Validate line - if not re.match(r'^(\d+) ([A-Z0-9]+)$', line): - raise ValidationError( - f"Invalid line {i}.\n" - f"{line} \n" - "Expected index and an alphanumeric code separated " - "by a single space." - ) - - # Check index is equal to line number - idx, code = line.rstrip("\n").split(sep=" ") - if not idx == str(i): - raise ValidationError( - f"Invalid line {i}.\n" - f"{line} \n" - f"Expected index {i} but got {idx} instead.\n" - ) - - class BaseHmmPressedDirFmt(model.DirectoryFormat): """ The .h3m file contains the profile HMMs @@ -60,9 +29,6 @@ class BaseHmmPressedDirFmt(model.DirectoryFormat): h3i = model.File(r'.*\.hmm\.h3i', format=HmmBinaryFileFmt) h3f = model.File(r'.*\.hmm\.h3f', format=HmmBinaryFileFmt) h3p = model.File(r'.*\.hmm\.h3p', format=HmmBinaryFileFmt) - idmap = model.File( - r'.*\.hmm\.idmap', format=HmmIdmapFileFmt, optional=True - ) class HmmBaseFileFmt(model.TextFileFormat): @@ -102,7 +68,7 @@ def _validate_file_fmt( ) -class AminoHmmFileFmt(HmmBaseFileFmt): +class ProteinHmmFileFmt(HmmBaseFileFmt): alphabet = "amino" def _validate_(self, level): @@ -123,17 +89,17 @@ def _validate_(self, level): self._validate_file_fmt(level, self.alphabet, True) -AminoHmmDirectoryFormat = model.SingleFileDirectoryFormat( - 'AminoHmmFileFmt', 'profile.hmm', AminoHmmFileFmt) +ProteinHmmDirectoryFormat = model.SingleFileDirectoryFormat( + 'AminoHmmFileFmt', r'.*\..hmm', ProteinHmmFileFmt) DnaHmmDirectoryFormat = model.SingleFileDirectoryFormat( - 'DnaHmmFileFmt', 'profile.hmm', DnaHmmFileFmt) + 'DnaHmmFileFmt', r'.*\..hmm', DnaHmmFileFmt) RnaHmmDirectoryFormat = model.SingleFileDirectoryFormat( - 'RnaHmmFileFmt', 'profile.hmm', RnaHmmFileFmt) + 'RnaHmmFileFmt', r'.*\..hmm', RnaHmmFileFmt) -class AminoHmmMultipleProfilesFileFmt(AminoHmmFileFmt): +class ProteinHmmMultipleProfilesFileFmt(ProteinHmmFileFmt): def _validate_(self, level): self._validate_file_fmt(level, self.alphabet, False) @@ -148,28 +114,30 @@ def _validate_(self, level): self._validate_file_fmt(level, self.alphabet, False) -AminoHmmMultipleProfilesDirectoryFormat = model.SingleFileDirectoryFormat( +ProteinHmmMultipleProfilesDirectoryFormat = model.SingleFileDirectoryFormat( 'AminoHmmMultipleProfilesDirectoryFormat', - 'profile.hmm', - AminoHmmMultipleProfilesFileFmt + r'.*\..hmm', + ProteinHmmMultipleProfilesFileFmt ) DnaHmmMultipleProfilesDirectoryFormat = model.SingleFileDirectoryFormat( 'DnaHmmMultipleProfilesDirectoryFormat', - 'profile.hmm', + r'.*\..hmm', DnaHmmMultipleProfilesFileFmt, ) RnaHmmMultipleProfilesDirectoryFormat = model.SingleFileDirectoryFormat( 'RnaHmmMultipleProfilesDirectoryFormat', - 'profile.hmm', + r'.*\..hmm', RnaHmmMultipleProfilesFileFmt, ) plugin.register_formats( - AminoHmmMultipleProfilesFileFmt, DnaHmmMultipleProfilesFileFmt, - RnaHmmMultipleProfilesFileFmt, AminoHmmMultipleProfilesDirectoryFormat, + ProteinHmmMultipleProfilesFileFmt, + DnaHmmMultipleProfilesFileFmt, + RnaHmmMultipleProfilesFileFmt, + ProteinHmmMultipleProfilesDirectoryFormat, DnaHmmMultipleProfilesDirectoryFormat, RnaHmmMultipleProfilesDirectoryFormat, - AminoHmmDirectoryFormat, DnaHmmDirectoryFormat, RnaHmmDirectoryFormat + ProteinHmmDirectoryFormat, DnaHmmDirectoryFormat, RnaHmmDirectoryFormat ) diff --git a/q2_types/profile_hmms/_type.py b/q2_types/profile_hmms/_type.py new file mode 100644 index 00000000..86233016 --- /dev/null +++ b/q2_types/profile_hmms/_type.py @@ -0,0 +1,134 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2023, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- +from qiime2.plugin import SemanticType +from q2_types.plugin_setup import plugin +from q2_types.profile_hmms._format import ( + ProteinHmmMultipleProfilesDirectoryFormat, + DnaHmmMultipleProfilesDirectoryFormat, + RnaHmmMultipleProfilesDirectoryFormat, + ProteinHmmDirectoryFormat, DnaHmmDirectoryFormat, RnaHmmDirectoryFormat, + BaseHmmPressedDirFmt +) + + +ProfileHMM = SemanticType('ProfileHMM', field_names='type') +SingleProtein = SemanticType( + 'SingleProtein', variant_of=ProfileHMM.field['type'] +) +SingleDNA = SemanticType( + 'SingleDNA', variant_of=ProfileHMM.field['type'] +) +SingleRNA = SemanticType( + 'SingleRNA', variant_of=ProfileHMM.field['type'] +) +MultipleProtein = SemanticType( + 'MultipleProtein', variant_of=ProfileHMM.field['type'] +) +MultipleDNA = SemanticType( + 'MultipleDNA', variant_of=ProfileHMM.field['type'] +) +MultipleRNA = SemanticType( + 'MultipleRNA', variant_of=ProfileHMM.field['type'] +) +PressedProtein = SemanticType( + 'PressedProtein', variant_of=ProfileHMM.field['type'] +) +PressedDNA = SemanticType( + 'PressedDNA', variant_of=ProfileHMM.field['type'] +) +PressedRNA = SemanticType( + 'PressedRNA', variant_of=ProfileHMM.field['type'] +) + +plugin.register_semantic_types( + ProfileHMM, + SingleProtein, SingleDNA, SingleRNA, + MultipleProtein, MultipleDNA, MultipleRNA, + PressedProtein, PressedDNA, PressedRNA +) + +plugin.register_artifact_class( + ProfileHMM[PressedProtein], + directory_format=BaseHmmPressedDirFmt, + description=( + "A collection of profile Hidden Markov Models for amino acid " + "sequences in binary format and indexed." + ) +) + +plugin.register_artifact_class( + ProfileHMM[PressedDNA], + directory_format=BaseHmmPressedDirFmt, + description=( + "A collection of profile Hidden Markov Models for DNA " + "sequences in binary format and indexed." + ) +) + +plugin.register_artifact_class( + ProfileHMM[PressedRNA], + directory_format=BaseHmmPressedDirFmt, + description=( + "A collection of profile Hidden Markov Models for RNA " + "sequences in binary format and indexed." + ) +) + +plugin.register_artifact_class( + ProfileHMM[SingleProtein], + directory_format=ProteinHmmDirectoryFormat, + description=( + "One single profile Hidden Markov Model representing a group " + "of related proteins." + ) +) + +plugin.register_artifact_class( + ProfileHMM[SingleDNA], + directory_format=DnaHmmDirectoryFormat, + description=( + "One single profile Hidden Markov Model representing a group " + "of related DNA sequences." + ) +) + +plugin.register_artifact_class( + ProfileHMM[SingleRNA], + directory_format=RnaHmmDirectoryFormat, + description=( + "One single profile Hidden Markov Model representing a group " + "of related RNA sequences." + ) +) + +plugin.register_artifact_class( + ProfileHMM[MultipleProtein], + directory_format=ProteinHmmMultipleProfilesDirectoryFormat, + description=( + "A collection of profile Hidden Markov Models, " + "each representing a group of related proteins." + ) +) + +plugin.register_artifact_class( + ProfileHMM[MultipleDNA], + directory_format=DnaHmmMultipleProfilesDirectoryFormat, + description=( + "A collection of profile Hidden Markov Models, " + "each representing a group of related DNA sequences." + ) +) + +plugin.register_artifact_class( + ProfileHMM[MultipleRNA], + directory_format=RnaHmmMultipleProfilesDirectoryFormat, + description=( + "A collection of profile Hidden Markov Models, " + "each representing a group of related RNA sequences." + ) +) diff --git a/q2_types/hmmer/tests/__init__.py b/q2_types/profile_hmms/tests/__init__.py similarity index 100% rename from q2_types/hmmer/tests/__init__.py rename to q2_types/profile_hmms/tests/__init__.py diff --git a/q2_types/hmmer/tests/data/bacteria/bacteria.hmm.h3f b/q2_types/profile_hmms/tests/data/bacteria/bacteria.hmm.h3f similarity index 100% rename from q2_types/hmmer/tests/data/bacteria/bacteria.hmm.h3f rename to q2_types/profile_hmms/tests/data/bacteria/bacteria.hmm.h3f diff --git a/q2_types/hmmer/tests/data/bacteria/bacteria.hmm.h3i b/q2_types/profile_hmms/tests/data/bacteria/bacteria.hmm.h3i similarity index 100% rename from q2_types/hmmer/tests/data/bacteria/bacteria.hmm.h3i rename to q2_types/profile_hmms/tests/data/bacteria/bacteria.hmm.h3i diff --git a/q2_types/hmmer/tests/data/bacteria/bacteria.hmm.h3m b/q2_types/profile_hmms/tests/data/bacteria/bacteria.hmm.h3m similarity index 100% rename from q2_types/hmmer/tests/data/bacteria/bacteria.hmm.h3m rename to q2_types/profile_hmms/tests/data/bacteria/bacteria.hmm.h3m diff --git a/q2_types/hmmer/tests/data/bacteria/bacteria.hmm.h3p b/q2_types/profile_hmms/tests/data/bacteria/bacteria.hmm.h3p similarity index 100% rename from q2_types/hmmer/tests/data/bacteria/bacteria.hmm.h3p rename to q2_types/profile_hmms/tests/data/bacteria/bacteria.hmm.h3p diff --git a/q2_types/hmmer/tests/data/hmms/2_dna.hmm b/q2_types/profile_hmms/tests/data/hmms/2_dna.hmm similarity index 100% rename from q2_types/hmmer/tests/data/hmms/2_dna.hmm rename to q2_types/profile_hmms/tests/data/hmms/2_dna.hmm diff --git a/q2_types/hmmer/tests/data/hmms/2_rna.hmm b/q2_types/profile_hmms/tests/data/hmms/2_rna.hmm similarity index 100% rename from q2_types/hmmer/tests/data/hmms/2_rna.hmm rename to q2_types/profile_hmms/tests/data/hmms/2_rna.hmm diff --git a/q2_types/hmmer/tests/data/hmms/4_amino.hmm b/q2_types/profile_hmms/tests/data/hmms/4_amino.hmm similarity index 100% rename from q2_types/hmmer/tests/data/hmms/4_amino.hmm rename to q2_types/profile_hmms/tests/data/hmms/4_amino.hmm diff --git a/q2_types/hmmer/tests/data/hmms/amino.hmm b/q2_types/profile_hmms/tests/data/hmms/amino.hmm similarity index 100% rename from q2_types/hmmer/tests/data/hmms/amino.hmm rename to q2_types/profile_hmms/tests/data/hmms/amino.hmm diff --git a/q2_types/hmmer/tests/data/hmms/amino_dna.hmm b/q2_types/profile_hmms/tests/data/hmms/amino_dna.hmm similarity index 100% rename from q2_types/hmmer/tests/data/hmms/amino_dna.hmm rename to q2_types/profile_hmms/tests/data/hmms/amino_dna.hmm diff --git a/q2_types/hmmer/tests/data/hmms/dna.hmm b/q2_types/profile_hmms/tests/data/hmms/dna.hmm similarity index 100% rename from q2_types/hmmer/tests/data/hmms/dna.hmm rename to q2_types/profile_hmms/tests/data/hmms/dna.hmm diff --git a/q2_types/hmmer/tests/data/hmms/rna.hmm b/q2_types/profile_hmms/tests/data/hmms/rna.hmm similarity index 100% rename from q2_types/hmmer/tests/data/hmms/rna.hmm rename to q2_types/profile_hmms/tests/data/hmms/rna.hmm diff --git a/q2_types/hmmer/tests/data/hmms/rna_dna.hmm b/q2_types/profile_hmms/tests/data/hmms/rna_dna.hmm similarity index 100% rename from q2_types/hmmer/tests/data/hmms/rna_dna.hmm rename to q2_types/profile_hmms/tests/data/hmms/rna_dna.hmm diff --git a/q2_types/profile_hmms/tests/test_format.py b/q2_types/profile_hmms/tests/test_format.py new file mode 100644 index 00000000..e7cdd320 --- /dev/null +++ b/q2_types/profile_hmms/tests/test_format.py @@ -0,0 +1,103 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2023, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- +from qiime2.plugin.testing import TestPluginBase +from q2_types.profile_hmms._format import ( + BaseHmmPressedDirFmt, + ProteinHmmFileFmt, DnaHmmFileFmt, RnaHmmFileFmt, + ProteinHmmMultipleProfilesFileFmt, + DnaHmmMultipleProfilesFileFmt, + RnaHmmMultipleProfilesFileFmt +) +from qiime2.plugin import ValidationError + + +class TestHmmFormats(TestPluginBase): + package = 'q2_types.hmmer.tests' + + def test_BaseHmmPressedDirFmt_valid(self): + fmt = BaseHmmPressedDirFmt(self.get_data_path("bacteria"), 'r') + fmt.validate(level="min") + + def test_AminoHmmFileFmt_valid(self): + fmt = ProteinHmmFileFmt(self.get_data_path("hmms/amino.hmm"), "r") + fmt.validate() + + def test_DnaHmmFileFmt_valid(self): + fmt = DnaHmmFileFmt(self.get_data_path("hmms/dna.hmm"), "r") + fmt.validate() + + def test_RnaHmmFileFmt_valid(self): + fmt = RnaHmmFileFmt(self.get_data_path("hmms/rna.hmm"), "r") + fmt.validate() + + def test_AminoHmmFileFmt_invalid_alph(self): + for typ in ["rna", "dna"]: + fmt = ProteinHmmFileFmt(self.get_data_path(f"hmms/{typ}.hmm"), "r") + with self.assertRaisesRegex( + ValidationError, "Found profile with alphabet " + ): + fmt.validate() + + def test_DnaHmmFileFmt_invalid_alph(self): + for typ in ["rna", "amino"]: + fmt = DnaHmmFileFmt(self.get_data_path(f"hmms/{typ}.hmm"), "r") + with self.assertRaisesRegex( + ValidationError, "Found profile with alphabet " + ): + fmt.validate() + + def test_RnaHmmFileFmt_invalid_alph(self): + for typ in ["dna", "amino"]: + fmt = RnaHmmFileFmt(self.get_data_path(f"hmms/{typ}.hmm"), "r") + with self.assertRaisesRegex( + ValidationError, "Found profile with alphabet " + ): + fmt.validate() + + def test_AminoHmmFileFmt_too_many_profiles(self): + fmt = ProteinHmmFileFmt(self.get_data_path("hmms/4_amino.hmm"), "r") + with self.assertRaisesRegex( + ValidationError, "Expected 1 profile, found 4." + ): + fmt.validate() + + def test_AminoHmmMultipleProfilesFileFmt_valid(self): + fmt = ProteinHmmMultipleProfilesFileFmt( + self.get_data_path("hmms/4_amino.hmm"), 'r' + ) + fmt.validate() + + def test_DnaHmmMultipleProfilesFileFmt_valid(self): + fmt = DnaHmmMultipleProfilesFileFmt( + self.get_data_path("hmms/2_dna.hmm"), "r" + ) + fmt.validate() + + def test_RnaHmmMultipleProfilesFileFmt_valid(self): + fmt = RnaHmmMultipleProfilesFileFmt( + self.get_data_path("hmms/2_rna.hmm"), "r" + ) + fmt.validate() + + def test_mixed_hmm_profiles_invalid_1(self): + fmt = ProteinHmmMultipleProfilesFileFmt( + self.get_data_path("hmms/amino_dna.hmm"), 'r' + ) + with self.assertRaisesRegex( + ValidationError, "Found profiles with different alphabets." + ): + fmt.validate() + + def test_mixed_hmm_profiles_invalid_2(self): + fmt = DnaHmmMultipleProfilesFileFmt( + self.get_data_path("hmms/rna_dna.hmm"), 'r' + ) + with self.assertRaisesRegex( + ValidationError, "Found profiles with different alphabets." + ): + fmt.validate() diff --git a/q2_types/hmmer/tests/test_type.py b/q2_types/profile_hmms/tests/test_type.py similarity index 64% rename from q2_types/hmmer/tests/test_type.py rename to q2_types/profile_hmms/tests/test_type.py index 734aa00c..ed040495 100644 --- a/q2_types/hmmer/tests/test_type.py +++ b/q2_types/profile_hmms/tests/test_type.py @@ -6,15 +6,15 @@ # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- from qiime2.plugin.testing import TestPluginBase -from q2_types.hmmer import ( - HMM, BaseHmmPressedDirFmt, - AminoHmmMultipleProfilesDirectoryFormat, +from q2_types.profile_hmms._type import ( + ProfileHMM, BaseHmmPressedDirFmt, + ProteinHmmMultipleProfilesDirectoryFormat, DnaHmmMultipleProfilesDirectoryFormat, RnaHmmMultipleProfilesDirectoryFormat, - AminoHmmDirectoryFormat, DnaHmmDirectoryFormat, RnaHmmDirectoryFormat, - SingleAmino, SingleDNA, SingleRNA, - MultipleAmino, MultipleDNA, MultipleRNA, - MultipleAminoPressed, MultipleDNAPressed, MultipleRNAPressed + ProteinHmmDirectoryFormat, DnaHmmDirectoryFormat, RnaHmmDirectoryFormat, + SingleProtein, SingleDNA, SingleRNA, + MultipleProtein, MultipleDNA, MultipleRNA, + PressedProtein, PressedDNA, PressedRNA ) @@ -22,49 +22,50 @@ class TestHMMType(TestPluginBase): package = 'q2_types.reference_db.tests' def test_hmmer_registration(self): - self.assertRegisteredSemanticType(HMM) + self.assertRegisteredSemanticType(ProfileHMM) def test_SingleAmino_semantic_type_registered_to_DirFmt(self): self.assertSemanticTypeRegisteredToFormat( - HMM[SingleAmino], AminoHmmDirectoryFormat + ProfileHMM[SingleProtein], ProteinHmmDirectoryFormat ) def test_SingleDNA_semantic_type_registered_to_DirFmt(self): self.assertSemanticTypeRegisteredToFormat( - HMM[SingleDNA], DnaHmmDirectoryFormat + ProfileHMM[SingleDNA], DnaHmmDirectoryFormat ) def test_SingleRNA_semantic_type_registered_to_DirFmt(self): self.assertSemanticTypeRegisteredToFormat( - HMM[SingleRNA], RnaHmmDirectoryFormat + ProfileHMM[SingleRNA], RnaHmmDirectoryFormat ) def test_MultipleAmino_semantic_type_registered_to_DirFmt(self): self.assertSemanticTypeRegisteredToFormat( - HMM[MultipleAmino], AminoHmmMultipleProfilesDirectoryFormat + ProfileHMM[MultipleProtein], + ProteinHmmMultipleProfilesDirectoryFormat ) def test_MultipleDNA_semantic_type_registered_to_DirFmt(self): self.assertSemanticTypeRegisteredToFormat( - HMM[MultipleDNA], DnaHmmMultipleProfilesDirectoryFormat + ProfileHMM[MultipleDNA], DnaHmmMultipleProfilesDirectoryFormat ) def test_MultipleRNA_semantic_type_registered_to_DirFmt(self): self.assertSemanticTypeRegisteredToFormat( - HMM[MultipleRNA], RnaHmmMultipleProfilesDirectoryFormat + ProfileHMM[MultipleRNA], RnaHmmMultipleProfilesDirectoryFormat ) def test_MultipleAminoPressed_semantic_type_registered_to_DirFmt(self): self.assertSemanticTypeRegisteredToFormat( - HMM[MultipleAminoPressed], BaseHmmPressedDirFmt + ProfileHMM[PressedProtein], BaseHmmPressedDirFmt ) def test_MultipleDNAPressed_semantic_type_registered_to_DirFmt(self): self.assertSemanticTypeRegisteredToFormat( - HMM[MultipleDNAPressed], BaseHmmPressedDirFmt + ProfileHMM[PressedDNA], BaseHmmPressedDirFmt ) def test_MultipleRNAPressed_semantic_type_registered_to_DirFmt(self): self.assertSemanticTypeRegisteredToFormat( - HMM[MultipleRNAPressed], BaseHmmPressedDirFmt + ProfileHMM[PressedRNA], BaseHmmPressedDirFmt ) diff --git a/q2_types/reference_db/tests/test_format.py b/q2_types/reference_db/tests/test_format.py index 105b27da..41218089 100644 --- a/q2_types/reference_db/tests/test_format.py +++ b/q2_types/reference_db/tests/test_format.py @@ -7,11 +7,11 @@ # ---------------------------------------------------------------------------- from qiime2.plugin.testing import TestPluginBase from q2_types.reference_db._format import ( - DiamondDatabaseFileFmt, DiamondDatabaseDirFmt, EggnogRefBinFileFmt, - EggnogRefDirFmt, NCBITaxonomyNamesFormat, NCBITaxonomyNodesFormat, - NCBITaxonomyDirFmt, NCBITaxonomyBinaryFileFmt, - EggnogProteinSequencesDirFmt, EggnogRefTextFileFmt - ) + DiamondDatabaseFileFmt, DiamondDatabaseDirFmt, EggnogRefBinFileFmt, + EggnogRefDirFmt, NCBITaxonomyNamesFormat, NCBITaxonomyNodesFormat, + NCBITaxonomyDirFmt, NCBITaxonomyBinaryFileFmt, + EggnogProteinSequencesDirFmt, EggnogRefTextFileFmt +) from qiime2.plugin import ValidationError