Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: add ProfileHMM[*] semantic types #328

Merged
merged 29 commits into from
Jun 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
70e68ca
implement semantic type for hmmer db + tests
Sann5 May 16, 2024
0d729a3
Add format and type to reference_db.__init__.py
Sann5 May 17, 2024
571d35f
idmap file fmt + tests
Sann5 May 23, 2024
1fd9c9b
work in progress. hmm file parser
Sann5 May 30, 2024
f1ee960
use the pyhmmer validation for hmm files
Sann5 May 31, 2024
6bf003f
Merge remote-tracking branch 'upstream/dev' into st_hmmer_db
Sann5 May 31, 2024
cb20829
made st and formats for each alphabet type
Sann5 May 31, 2024
d004015
renamed formats and st
Sann5 May 31, 2024
8acdb89
migrate types and test data to new module. TODO migrate tests
Sann5 May 31, 2024
f95358e
remove diff from irrelevant files
Sann5 May 31, 2024
9c8447f
same as last commit
Sann5 May 31, 2024
90bdde4
more changes to reduce diff. TODO: adjust tests
Sann5 May 31, 2024
cd7ecff
fix registrations. work in progress
Sann5 May 31, 2024
5453694
add descriptions and fix circular import
Sann5 Jun 3, 2024
9b1a8c1
remove word profile
Sann5 Jun 3, 2024
f19fb3b
add tests for type
Sann5 Jun 3, 2024
890f0d7
test the formats. tbc
Sann5 Jun 3, 2024
6ec9360
more tests for formats
Sann5 Jun 4, 2024
d26f278
add pyhmmer to dependencies
Sann5 Jun 4, 2024
69d2a81
renaming stuff
Sann5 Jun 10, 2024
c6e61af
update package in tests
Sann5 Jun 10, 2024
9ae98e1
change to name an classes
Sann5 Jun 10, 2024
39887c8
add to the error message of multiple profiles
Sann5 Jun 10, 2024
6d127e5
include testing data in setup.py
Sann5 Jun 11, 2024
1e450c5
typo
Sann5 Jun 11, 2024
efc32b2
class variable in **ProfileHmmFileFmt() from single to single_profile
Sann5 Jun 20, 2024
054d5da
change format names from D/Rna to D/RNA
Sann5 Jun 20, 2024
24f148a
update mixed_hmm_profiles tests compatible with pyhmmer 0.10.13
Sann5 Jun 21, 2024
ff5e807
catch AlphabetMismatch as ValidationError
Sann5 Jun 25, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ci/recipe/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ requirements:
- h5py
- qiime2 {{ qiime2_epoch }}.*
- samtools
- pyhmmer

test:
commands:
Expand Down
1 change: 1 addition & 0 deletions q2_types/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,4 @@
importlib.import_module('q2_types.genome_data')
importlib.import_module('q2_types.kaiju')
importlib.import_module('q2_types.reference_db')
# Importing q2_types.profile_hmms executes the plugin.register_* calls made
# at module level in its _format and _type modules.
# NOTE(review): presumably the packages imported above register themselves
# the same way -- confirm against their sources.
importlib.import_module('q2_types.profile_hmms')
38 changes: 38 additions & 0 deletions q2_types/profile_hmms/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2023, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
from ._format import (
    ProfileHmmBinaryFileFmt,
    PressedProfileHmmsDirectoryFmt,
    ProfileHmmFileFmt,
    ProteinSingleProfileHmmDirectoryFmt,
    ProteinMultipleProfileHmmDirectoryFmt,
    DNASingleProfileHmmDirectoryFmt,
    DNAMultipleProfileHmmDirectoryFmt,
    RNASingleProfileHmmDirectoryFmt,
    RNAMultipleProfileHmmDirectoryFmt
)
from ._type import (
    ProfileHMM,
    SingleProtein, SingleDNA, SingleRNA,
    MultipleProtein, MultipleDNA, MultipleRNA,
    PressedRNA, PressedDNA, PressedProtein
)

# Every name listed in __all__ has to be bound in this module: a star import
# looks each one up as an attribute and raises AttributeError if it is
# missing. ProfileHmmBinaryFileFmt and ProfileHmmFileFmt are therefore
# imported above alongside the directory formats.
__all__ = [
    "ProfileHmmBinaryFileFmt",
    "PressedProfileHmmsDirectoryFmt",
    "ProfileHmmFileFmt",
    "ProteinSingleProfileHmmDirectoryFmt",
    "ProteinMultipleProfileHmmDirectoryFmt",
    "DNASingleProfileHmmDirectoryFmt",
    "DNAMultipleProfileHmmDirectoryFmt",
    "RNASingleProfileHmmDirectoryFmt",
    "RNAMultipleProfileHmmDirectoryFmt",
    "ProfileHMM",
    "SingleProtein", "SingleDNA", "SingleRNA",
    "MultipleProtein", "MultipleDNA", "MultipleRNA",
    "PressedRNA", "PressedDNA", "PressedProtein"
]
138 changes: 138 additions & 0 deletions q2_types/profile_hmms/_format.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2023, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
from pyhmmer.plan7 import HMMFile
from pyhmmer.errors import AlphabetMismatch
from qiime2.plugin import model
from qiime2.core.exceptions import ValidationError
from q2_types.plugin_setup import plugin


class ProfileHmmBinaryFileFmt(model.BinaryFileFormat):
    """Binary file format whose contents are accepted without inspection."""

    def _validate_(self, level):
        # Deliberate no-op: nothing is checked, whatever the level.
        return None


class PressedProfileHmmsDirectoryFmt(model.DirectoryFormat):
    """
    Directory of the four binary files that make up a pressed HMM database.

    - <hmmfile>.h3m: the profile HMMs and their annotation, in binary form.
    - <hmmfile>.h3i: an SSI index for the <hmmfile>.h3m file.
    - <hmmfile>.h3f: precomputed data structures for the fast heuristic
      filter (the MSV filter).
    - <hmmfile>.h3p: precomputed data structures for the rest of each
      profile.
    """
    h3m = model.File(
        r'.*\.hmm\.h3m',
        format=ProfileHmmBinaryFileFmt,
    )
    h3i = model.File(
        r'.*\.hmm\.h3i',
        format=ProfileHmmBinaryFileFmt,
    )
    h3f = model.File(
        r'.*\.hmm\.h3f',
        format=ProfileHmmBinaryFileFmt,
    )
    h3p = model.File(
        r'.*\.hmm\.h3p',
        format=ProfileHmmBinaryFileFmt,
    )


class ProfileHmmFileFmt(model.TextFileFormat):
    """
    Text profile HMM file that is parsed and checked with pyhmmer.

    ``_validate_`` reads two class attributes that this base class does not
    define, so they must be supplied by subclasses:

    - ``alphabet``: lowercase alphabet name; it is compared against
      ``hmm_profile.alphabet.type.lower()`` for each validated profile.
    - ``single``: when truthy, a file holding more than one profile is
      rejected.
    """

    def _validate_(self, level: str):
        """
        Check http://eddylab.org/software/hmmer/Userguide.pdf
        section "HMMER profile HMM files" for full description of
        hmm file format.

        Parameters
        ----------
        level : str
            ``"min"`` validates only the first three profiles, ``"max"``
            validates every profile. Any other value raises ``KeyError``.

        Raises
        ------
        ValidationError
            If pyhmmer raises ``AlphabetMismatch`` while the file is read,
            if ``single`` is set and more than one profile is found, or if
            a validated profile's alphabet differs from ``alphabet``.

        Notes
        -----
        Any other exception raised by pyhmmer while opening, reading or
        validating propagates unchanged.
        """
        # A slice bound of None means "no limit", i.e. all profiles.
        parse_n_profiles = {"min": 3, "max": None}[level]
        tolerance = 0.0001

        with HMMFile(str(self)) as hmm_file:
            try:
                # The whole file is read at every level because the
                # single-profile check below needs the total count.
                hmm_profiles = list(hmm_file)
            except AlphabetMismatch as err:
                # Chain explicitly so the pyhmmer error stays attached as
                # the cause of the validation failure.
                raise ValidationError(
                    "Found profiles with alphabet different from "
                    f"'{self.alphabet}'"
                ) from err

            if len(hmm_profiles) > 1 and self.single:
                raise ValidationError(
                    f"Expected 1 profile, found {len(hmm_profiles)}."
                )

            for hmm_profile in hmm_profiles[:parse_n_profiles]:
                hmm_profile.validate(tolerance=tolerance)

                found_alphabet = hmm_profile.alphabet.type.lower()
                if found_alphabet != self.alphabet:
                    raise ValidationError(
                        "Found profile with alphabet "
                        f"{found_alphabet}\n"
                        f"Expected alphabet: {self.alphabet}."
                    )


class ProteinProfileHmmFileFmt(ProfileHmmFileFmt):
    """Profile HMM file format with ``alphabet`` fixed to ``"amino"``."""

    # Compared by ProfileHmmFileFmt._validate_ with each profile's alphabet.
    alphabet = "amino"


class ProteinSingleProfileHmmFileFmt(ProteinProfileHmmFileFmt):
    """Protein profile HMM file format with ``single`` set to True."""

    # ProfileHmmFileFmt._validate_ rejects files with more than one profile.
    single = True


class ProteinMultipleProfileHmmFileFmt(ProteinProfileHmmFileFmt):
    """Protein profile HMM file format with ``single`` set to False."""

    # ProfileHmmFileFmt._validate_ skips its profile-count check.
    single = False


class DNAProfileHmmFileFmt(ProfileHmmFileFmt):
    """Profile HMM file format with ``alphabet`` fixed to ``"dna"``."""

    # Compared by ProfileHmmFileFmt._validate_ with each profile's alphabet.
    alphabet = "dna"


class DNASingleProfileHmmFileFmt(DNAProfileHmmFileFmt):
    """DNA profile HMM file format with ``single`` set to True."""

    # ProfileHmmFileFmt._validate_ rejects files with more than one profile.
    single = True


class DNAMultipleProfileHmmFileFmt(DNAProfileHmmFileFmt):
    """DNA profile HMM file format with ``single`` set to False."""

    # ProfileHmmFileFmt._validate_ skips its profile-count check.
    single = False


class RNAProfileHmmFileFmt(ProfileHmmFileFmt):
    """Profile HMM file format with ``alphabet`` fixed to ``"rna"``."""

    # Compared by ProfileHmmFileFmt._validate_ with each profile's alphabet.
    alphabet = "rna"


class RNASingleProfileHmmFileFmt(RNAProfileHmmFileFmt):
    """RNA profile HMM file format with ``single`` set to True."""

    # ProfileHmmFileFmt._validate_ rejects files with more than one profile.
    single = True


class RNAMultipleProfileHmmFileFmt(RNAProfileHmmFileFmt):
    """RNA profile HMM file format with ``single`` set to False."""

    # ProfileHmmFileFmt._validate_ skips its profile-count check.
    single = False


class ProteinSingleProfileHmmDirectoryFmt(model.DirectoryFormat):
    """Directory format wrapping a ProteinSingleProfileHmmFileFmt file."""

    profile = model.File(
        r'.*\.hmm',
        format=ProteinSingleProfileHmmFileFmt,
    )


class ProteinMultipleProfileHmmDirectoryFmt(model.DirectoryFormat):
    """Directory format wrapping a ProteinMultipleProfileHmmFileFmt file."""

    profiles = model.File(
        r'.*\.hmm',
        format=ProteinMultipleProfileHmmFileFmt,
    )


class DNASingleProfileHmmDirectoryFmt(model.DirectoryFormat):
    """Directory format wrapping a DNASingleProfileHmmFileFmt file."""

    profile = model.File(
        r'.*\.hmm',
        format=DNASingleProfileHmmFileFmt,
    )


class DNAMultipleProfileHmmDirectoryFmt(model.DirectoryFormat):
    """Directory format wrapping a DNAMultipleProfileHmmFileFmt file."""

    profiles = model.File(
        r'.*\.hmm',
        format=DNAMultipleProfileHmmFileFmt,
    )


class RNASingleProfileHmmDirectoryFmt(model.DirectoryFormat):
    """Directory format wrapping a RNASingleProfileHmmFileFmt file."""

    profile = model.File(
        r'.*\.hmm',
        format=RNASingleProfileHmmFileFmt,
    )


class RNAMultipleProfileHmmDirectoryFmt(model.DirectoryFormat):
    """Directory format wrapping a RNAMultipleProfileHmmFileFmt file."""

    profiles = model.File(
        r'.*\.hmm',
        format=RNAMultipleProfileHmmFileFmt,
    )


# Directory formats handed to the plugin, in registration order. The file
# formats defined in this module are not passed to register_formats.
_REGISTERED_DIRECTORY_FORMATS = (
    PressedProfileHmmsDirectoryFmt,
    ProteinSingleProfileHmmDirectoryFmt,
    ProteinMultipleProfileHmmDirectoryFmt,
    DNASingleProfileHmmDirectoryFmt,
    DNAMultipleProfileHmmDirectoryFmt,
    RNASingleProfileHmmDirectoryFmt,
    RNAMultipleProfileHmmDirectoryFmt,
)

plugin.register_formats(*_REGISTERED_DIRECTORY_FORMATS)
136 changes: 136 additions & 0 deletions q2_types/profile_hmms/_type.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2023, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
from qiime2.plugin import SemanticType
from q2_types.plugin_setup import plugin
from q2_types.profile_hmms._format import (
PressedProfileHmmsDirectoryFmt,
DNASingleProfileHmmDirectoryFmt,
DNAMultipleProfileHmmDirectoryFmt,
RNASingleProfileHmmDirectoryFmt,
RNAMultipleProfileHmmDirectoryFmt,
ProteinSingleProfileHmmDirectoryFmt,
ProteinMultipleProfileHmmDirectoryFmt
)


# Root semantic type with one field, 'type'; the Single*, Multiple* and
# Pressed* semantic types in this module are declared as variants of it.
ProfileHMM = SemanticType(
    'ProfileHMM',
    field_names='type',
)
Sann5 marked this conversation as resolved.
Show resolved Hide resolved
def _profile_hmm_variant(name):
    """Create a semantic type that is a variant of ProfileHMM's 'type'."""
    return SemanticType(name, variant_of=ProfileHMM.field['type'])


# One profile per artifact.
SingleProtein = _profile_hmm_variant('SingleProtein')
SingleDNA = _profile_hmm_variant('SingleDNA')
SingleRNA = _profile_hmm_variant('SingleRNA')

# Several profiles per artifact.
MultipleProtein = _profile_hmm_variant('MultipleProtein')
MultipleDNA = _profile_hmm_variant('MultipleDNA')
MultipleRNA = _profile_hmm_variant('MultipleRNA')

# Profiles stored in the binary, indexed ("pressed") layout.
PressedProtein = _profile_hmm_variant('PressedProtein')
PressedDNA = _profile_hmm_variant('PressedDNA')
PressedRNA = _profile_hmm_variant('PressedRNA')

plugin.register_semantic_types(
    ProfileHMM,
    SingleProtein,
    SingleDNA,
    SingleRNA,
    MultipleProtein,
    MultipleDNA,
    MultipleRNA,
    PressedProtein,
    PressedDNA,
    PressedRNA,
)

# (semantic type, directory format, description) for each artifact class,
# listed in the order in which they are registered.
_ARTIFACT_CLASSES = (
    (
        ProfileHMM[PressedProtein],
        PressedProfileHmmsDirectoryFmt,
        "A collection of profile Hidden Markov Models for amino acid "
        "sequences in binary format and indexed.",
    ),
    (
        ProfileHMM[PressedDNA],
        PressedProfileHmmsDirectoryFmt,
        "A collection of profile Hidden Markov Models for DNA "
        "sequences in binary format and indexed.",
    ),
    (
        ProfileHMM[PressedRNA],
        PressedProfileHmmsDirectoryFmt,
        "A collection of profile Hidden Markov Models for RNA "
        "sequences in binary format and indexed.",
    ),
    (
        ProfileHMM[SingleProtein],
        ProteinSingleProfileHmmDirectoryFmt,
        "One single profile Hidden Markov Model representing a group "
        "of related proteins.",
    ),
    (
        ProfileHMM[SingleDNA],
        DNASingleProfileHmmDirectoryFmt,
        "One single profile Hidden Markov Model representing a group "
        "of related DNA sequences.",
    ),
    (
        ProfileHMM[SingleRNA],
        RNASingleProfileHmmDirectoryFmt,
        "One single profile Hidden Markov Model representing a group "
        "of related RNA sequences.",
    ),
    (
        ProfileHMM[MultipleProtein],
        ProteinMultipleProfileHmmDirectoryFmt,
        "A collection of profile Hidden Markov Models, "
        "each representing a group of related proteins.",
    ),
    (
        ProfileHMM[MultipleDNA],
        DNAMultipleProfileHmmDirectoryFmt,
        "A collection of profile Hidden Markov Models, "
        "each representing a group of related DNA sequences.",
    ),
    (
        ProfileHMM[MultipleRNA],
        RNAMultipleProfileHmmDirectoryFmt,
        "A collection of profile Hidden Markov Models, "
        "each representing a group of related RNA sequences.",
    ),
)

for _semantic_type, _directory_format, _description in _ARTIFACT_CLASSES:
    plugin.register_artifact_class(
        _semantic_type,
        directory_format=_directory_format,
        description=_description,
    )

# Drop the loop temporaries so they do not linger in the module namespace.
del _semantic_type, _directory_format, _description
7 changes: 7 additions & 0 deletions q2_types/profile_hmms/tests/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2023, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
Empty file.
Empty file.
Empty file.
Empty file.
Loading
Loading