From 04dd0da2a5a9724a2f7cea194a05e65992c27d23 Mon Sep 17 00:00:00 2001 From: Michal Ziemski Date: Wed, 17 Jul 2024 17:54:21 +0200 Subject: [PATCH] MAINT: refactor/clean up GenomeData types (#338) --- q2_types/feature_data/_type.py | 4 +- q2_types/feature_data_mag/__init__.py | 13 +- q2_types/feature_data_mag/_format.py | 28 ----- q2_types/feature_data_mag/_transformer.py | 88 +------------- q2_types/feature_data_mag/_type.py | 27 +--- .../feature_data_mag/tests/test_format.py | 38 +----- .../tests/test_transformers.py | 57 +-------- q2_types/feature_data_mag/tests/test_type.py | 29 +---- q2_types/genome_data/__init__.py | 12 +- q2_types/genome_data/_format.py | 40 +++++- q2_types/genome_data/_transformer.py | 115 +++++++++++++++--- q2_types/genome_data/_type.py | 41 +++++-- .../tests/data/genome-sequences/sequence1.fa | 5 + .../data/genome-sequences/sequence2.fasta | 4 + .../test_output.emapper.annotations | 0 .../test_output.emapper.orthologs | 0 ...4a19-8b93-e76a15ce107f.emapper.annotations | 0 ...4325-83a1-ea406f3e19ad.emapper.annotations | 0 .../test_output1.emapper.annotations | 0 .../test_output2.emapper.annotations | 0 .../test_output.emapper.annotations | 0 q2_types/genome_data/tests/test_format.py | 40 ++++++ .../genome_data/tests/test_transformer.py | 59 ++++++++- q2_types/genome_data/tests/test_type.py | 41 +++++-- q2_types/per_sample_sequences/_type.py | 7 +- .../per_sample_sequences/tests/test_type.py | 9 +- setup.py | 7 +- 27 files changed, 336 insertions(+), 328 deletions(-) create mode 100644 q2_types/genome_data/tests/data/genome-sequences/sequence1.fa create mode 100644 q2_types/genome_data/tests/data/genome-sequences/sequence2.fasta rename q2_types/{feature_data_mag/tests/data/good_ortholog_annotation => genome_data/tests/data/ortholog-annotation-extra}/test_output.emapper.annotations (100%) rename q2_types/{feature_data_mag/tests/data/ortholog_annotation_extra => genome_data/tests/data/ortholog-annotation-extra}/test_output.emapper.orthologs (100%) rename q2_types/{feature_data_mag/tests/data/ortholog_annotation_mags => genome_data/tests/data/ortholog-annotation-mags}/d33dbcbe-eedd-4a19-8b93-e76a15ce107f.emapper.annotations (100%) rename q2_types/{feature_data_mag/tests/data/ortholog_annotation_mags => genome_data/tests/data/ortholog-annotation-mags}/e6408340-b308-4325-83a1-ea406f3e19ad.emapper.annotations (100%) rename q2_types/{feature_data_mag/tests/data/ortholog_annotation_samples => genome_data/tests/data/ortholog-annotation-samples}/test_output1.emapper.annotations (100%) rename q2_types/{feature_data_mag/tests/data/ortholog_annotation_samples => genome_data/tests/data/ortholog-annotation-samples}/test_output2.emapper.annotations (100%) rename q2_types/{feature_data_mag/tests/data/ortholog_annotation_extra => genome_data/tests/data/ortholog-annotation}/test_output.emapper.annotations (100%) diff --git a/q2_types/feature_data/_type.py b/q2_types/feature_data/_type.py index 7d65baa3..6eed78a9 100644 --- a/q2_types/feature_data/_type.py +++ b/q2_types/feature_data/_type.py @@ -19,7 +19,6 @@ RNASequencesDirectoryFormat, AlignedRNASequencesDirectoryFormat, PairedRNASequencesDirectoryFormat, BLAST6DirectoryFormat, SequenceCharacteristicsDirectoryFormat) -from q2_types.sample_data import SampleData FeatureData = SemanticType('FeatureData', field_names='type') @@ -52,8 +51,7 @@ variant_of=FeatureData.field['type']) BLAST6 = SemanticType('BLAST6', - variant_of=[FeatureData.field['type'], - SampleData.field['type']]) + variant_of=FeatureData.field['type']) SequenceCharacteristics = SemanticType('SequenceCharacteristics', variant_of=FeatureData.field['type']) diff --git a/q2_types/feature_data_mag/__init__.py b/q2_types/feature_data_mag/__init__.py index 3701e8bc..c79d18ef 100644 --- a/q2_types/feature_data_mag/__init__.py +++ b/q2_types/feature_data_mag/__init__.py @@ -8,19 +8,12 @@ import importlib -from ._format import ( - MAGSequencesDirFmt, - OrthologAnnotationDirFmt, - OrthologFileFmt - ) +from ._format import MAGSequencesDirFmt -from ._type import MAG, NOG, OG, KEGG, Contig +from ._type import MAG, Contig from ._transformer import MAGIterator -__all__ = [ - 'MAG', 'MAGSequencesDirFmt', 'MAGIterator', 'NOG', 'OG', 'KEGG', - 'OrthologAnnotationDirFmt', 'OrthologFileFmt', 'Contig' - ] +__all__ = ['MAG', 'MAGSequencesDirFmt', 'MAGIterator', 'Contig'] importlib.import_module('q2_types.feature_data_mag._format') importlib.import_module('q2_types.feature_data_mag._transformer') diff --git a/q2_types/feature_data_mag/_format.py b/q2_types/feature_data_mag/_format.py index 935b4304..af7b02dd 100644 --- a/q2_types/feature_data_mag/_format.py +++ b/q2_types/feature_data_mag/_format.py @@ -9,7 +9,6 @@ import re from q2_types.feature_data import DNAFASTAFormat -from q2_types.genome_data._format import OrthologFileFmt from qiime2.plugin import model from ..plugin_setup import plugin @@ -62,30 +61,3 @@ def feature_dict(self, relative=False): plugin.register_formats(MAGSequencesDirFmt) - - -class OrthologAnnotationDirFmt(model.DirectoryFormat): - pathspec = r'.+\.annotations' - annotations = model.FileCollection(pathspec, format=OrthologFileFmt) - - @annotations.set_path_maker - def annotations_path_maker(self, file_name): - return file_name.split(sep="_")[0] - - def annotation_dict(self, relative=False) -> dict: - ids = {} - for path in self.path.iterdir(): - if re.compile(self.pathspec).match(path.name): - _id = re.sub('.emapper$', '', path.stem) - absolute_path = path.absolute() - if relative: - ids[_id] = str( - absolute_path.relative_to(self.path.absolute()) - ) - else: - ids[_id] = str(absolute_path) - - return dict(sorted(ids.items())) - - -plugin.register_formats(OrthologAnnotationDirFmt) diff --git a/q2_types/feature_data_mag/_transformer.py b/q2_types/feature_data_mag/_transformer.py index cfb10db7..af7925ea 100644 --- a/q2_types/feature_data_mag/_transformer.py +++ b/q2_types/feature_data_mag/_transformer.py @@ -5,22 +5,17 @@ # # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- -from io import StringIO - -import uuid - import collections.abc import glob import os.path -import qiime2 from itertools import repeat import pandas as pd import skbio from q2_types.feature_data._transformer import _fastaformats_to_series -from . import MAGSequencesDirFmt, OrthologAnnotationDirFmt +from . import MAGSequencesDirFmt from ..plugin_setup import plugin CONSTRUCTORS = { @@ -94,84 +89,3 @@ def _5(data: MAGIterator) -> MAGSequencesDirFmt: with open(fp, 'a') as fin: skbio.io.write(seq, format='fasta', into=fin) return result - - -def _is_valid_uuid4(uuid_string: str): - """ - Check if a given string is a valid UUID version 4. - - This function checks if the provided string is a valid UUID version 4. - The only purpose of doing that here is to identify whether provided - string was a MAG ID (UUID4) or a sample ID. For that reason, we don't - print any statements or raise any exceptions. - - Parameters: - uuid_string (str): The string to check for UUID version 4 validity. - - Returns: - bool: True if the string is a valid UUID version 4, False otherwise. - """ - try: - uuid_obj = uuid.UUID(uuid_string, version=4) - except ValueError: - return False - - return str(uuid_obj) == uuid_string - - -def _reshuffle_columns(df: pd.DataFrame): - if 'MAG' in df.columns: - col = 'MAG' - elif 'Sample' in df.columns: - col = 'Sample' - else: - return df - - cols = df.columns.tolist() - cols.remove(col) - cols.insert(0, col) - df = df[cols] - return df - - -def _annotations_to_dataframe( - data: OrthologAnnotationDirFmt -) -> pd.DataFrame: - annotations = data.annotation_dict() - dfs = [] - for _id, path in annotations.items(): - # we need to ignore the ## comments at - # the beginning and end of the file - with open(path, 'r') as f: - lines = [line for line in f if not line.startswith('##')] - - df = pd.read_csv(StringIO('\n'.join(lines)), sep='\t', index_col=0) - if _is_valid_uuid4(_id): - df['MAG'] = _id - else: - df['Sample'] = _id - - dfs.append(df) - - df = pd.concat(dfs) - - # to satisfy QIIME2's particular requirements - df.reset_index(drop=False, inplace=True) - df.index = df.index.astype(str) - df.index.rename('id', inplace=True) - - # reshuffle columns for nicer display - df = _reshuffle_columns(df) - - return df - - -@plugin.register_transformer -def _7(data: OrthologAnnotationDirFmt) -> pd.DataFrame: - return _annotations_to_dataframe(data) - - -@plugin.register_transformer -def _8(data: OrthologAnnotationDirFmt) -> qiime2.Metadata: - annotations = _annotations_to_dataframe(data) - return qiime2.Metadata(annotations) diff --git a/q2_types/feature_data_mag/_type.py b/q2_types/feature_data_mag/_type.py index 29c7b11c..679aae3e 100644 --- a/q2_types/feature_data_mag/_type.py +++ b/q2_types/feature_data_mag/_type.py @@ -8,9 +8,7 @@ from q2_types.feature_data import FeatureData -from q2_types.feature_data_mag._format import ( - MAGSequencesDirFmt, OrthologAnnotationDirFmt - ) +from q2_types.feature_data_mag._format import MAGSequencesDirFmt from qiime2.core.type import SemanticType from ..bowtie2 import Bowtie2IndexDirFmt @@ -34,29 +32,6 @@ artifact_format=ContigSequencesDirFmt ) -NOG = SemanticType('NOG', variant_of=FeatureData.field['type']) - -plugin.register_semantic_types(NOG) -plugin.register_artifact_class( - FeatureData[NOG], - directory_format=OrthologAnnotationDirFmt) - - -OG = SemanticType('OG', variant_of=FeatureData.field['type']) - -plugin.register_semantic_types(OG) -plugin.register_artifact_class( - FeatureData[OG], - directory_format=OrthologAnnotationDirFmt) - - -KEGG = SemanticType('KEGG', variant_of=FeatureData.field['type']) - -plugin.register_semantic_types(KEGG) -plugin.register_artifact_class( - FeatureData[KEGG], - directory_format=OrthologAnnotationDirFmt) - plugin.register_semantic_type_to_format( FeatureData[SingleBowtie2Index], artifact_format=Bowtie2IndexDirFmt diff --git a/q2_types/feature_data_mag/tests/test_format.py b/q2_types/feature_data_mag/tests/test_format.py index 719ecf41..7f868c97 100644 --- a/q2_types/feature_data_mag/tests/test_format.py +++ b/q2_types/feature_data_mag/tests/test_format.py @@ -11,11 +11,8 @@ import unittest from qiime2.plugin.testing import TestPluginBase -from qiime2.plugin import ValidationError -from q2_types.feature_data_mag._format import ( - MAGSequencesDirFmt, OrthologAnnotationDirFmt, - ) +from q2_types.feature_data_mag._format import MAGSequencesDirFmt class TestFormats(TestPluginBase): @@ -60,39 +57,6 @@ def test_mag_dirfmt_feature_dict(self): } self.assertDictEqual(obs, exp) - def test_ortholog_annotation_dir_fmt_passing(self): - dirpath = self.get_data_path('good_ortholog_annotation') - fmt_obj = OrthologAnnotationDirFmt(dirpath, mode='r') - fmt_obj.validate() - - def test_ortholog_annotation_dir_fmt_fails_extra_file(self): - dirpath = self.get_data_path('ortholog_annotation_extra') - fmt_obj = OrthologAnnotationDirFmt(dirpath, mode='r') - - with self.assertRaisesRegex(ValidationError, "Unrecognized file"): - fmt_obj.validate() - - def test_ortholog_annotations_annot_dict(self): - annotations = OrthologAnnotationDirFmt( - self.get_data_path('ortholog_annotation_samples'), mode='r' - ) - - obs = annotations.annotation_dict() - exp = { - 'test_output1': - str(annotations.path / 'test_output1.emapper.annotations'), - 'test_output2': - str(annotations.path / 'test_output2.emapper.annotations') - } - self.assertDictEqual(obs, exp) - - obs = annotations.annotation_dict(relative=True) - exp = { - 'test_output1': 'test_output1.emapper.annotations', - 'test_output2': 'test_output2.emapper.annotations' - } - self.assertDictEqual(obs, exp) - if __name__ == '__main__': unittest.main() diff --git a/q2_types/feature_data_mag/tests/test_transformers.py b/q2_types/feature_data_mag/tests/test_transformers.py index 03b2f7f0..810560cc 100644 --- a/q2_types/feature_data_mag/tests/test_transformers.py +++ b/q2_types/feature_data_mag/tests/test_transformers.py @@ -7,7 +7,6 @@ # ---------------------------------------------------------------------------- import glob -import qiime2 import unittest from itertools import repeat @@ -17,11 +16,9 @@ from skbio import DNA from q2_types.feature_data_mag import ( - MAGSequencesDirFmt, MAGIterator, OrthologAnnotationDirFmt -) -from q2_types.feature_data_mag._transformer import ( - _get_filename, _annotations_to_dataframe + MAGSequencesDirFmt, MAGIterator ) +from q2_types.feature_data_mag._transformer import _get_filename class TestTransformers(TestPluginBase): @@ -132,56 +129,6 @@ def test_mag_iterator_to_mag_sequences_dir_fmt(self): obs_seqs = self.read_seqs_into_dict(str(obs)) self.assertDictEqual(self.mags_fa, obs_seqs) - def test_annotations_to_dataframe_samples(self): - annotations = OrthologAnnotationDirFmt( - self.get_data_path('ortholog_annotation_samples'), - mode='r' - ) - obs = _annotations_to_dataframe(annotations) - self.assertEqual((11, 22), obs.shape) - self.assertTrue(obs.columns[0] == "Sample") - self.assertTrue(obs.index.is_unique) - self.assertEqual("id", obs.index.name) - - def test_annotations_to_dataframe_mags(self): - annotations = OrthologAnnotationDirFmt( - self.get_data_path('ortholog_annotation_mags'), - mode='r' - ) - obs = _annotations_to_dataframe(annotations) - self.assertEqual((11, 22), obs.shape) - self.assertTrue(obs.columns[0] == "MAG") - self.assertTrue(obs.index.is_unique) - self.assertEqual("id", obs.index.name) - - def test_annotations_to_df_transformer(self): - annotations = OrthologAnnotationDirFmt( - self.get_data_path('ortholog_annotation_mags'), - mode='r' - ) - transformer = self.get_transformer( - OrthologAnnotationDirFmt, pd.DataFrame - ) - - obs = transformer(annotations) - self.assertIsInstance(obs, pd.DataFrame) - self.assertEqual((11, 22), obs.shape) - self.assertTrue(obs.columns[0] == "MAG") - self.assertTrue(obs.index.is_unique) - self.assertEqual("id", obs.index.name) - - def test_annotations_to_metadata_transformer(self): - annotations = OrthologAnnotationDirFmt( - self.get_data_path('ortholog_annotation_mags'), - mode='r' - ) - transformer = self.get_transformer( - OrthologAnnotationDirFmt, qiime2.Metadata - ) - - obs = transformer(annotations) - self.assertIsInstance(obs, qiime2.Metadata) - if __name__ == '__main__': unittest.main() diff --git a/q2_types/feature_data_mag/tests/test_type.py b/q2_types/feature_data_mag/tests/test_type.py index c8dfc324..0010dd47 100644 --- a/q2_types/feature_data_mag/tests/test_type.py +++ b/q2_types/feature_data_mag/tests/test_type.py @@ -12,10 +12,7 @@ from q2_types.feature_data import FeatureData from qiime2.plugin.testing import TestPluginBase -from q2_types.feature_data_mag import ( - MAG, MAGSequencesDirFmt, OrthologAnnotationDirFmt, - NOG, OG, KEGG, Contig -) +from q2_types.feature_data_mag import MAG, MAGSequencesDirFmt, Contig from q2_types.per_sample_sequences import ( ContigSequencesDirFmt, SingleBowtie2Index ) @@ -42,30 +39,6 @@ def test_contig_semantic_type_to_format_registration(self): ContigSequencesDirFmt ) - def test_nog_type_registration(self): - self.assertRegisteredSemanticType(NOG) - - def test_og_type_registration(self): - self.assertRegisteredSemanticType(OG) - - def test_kegg_type_registration(self): - self.assertRegisteredSemanticType(KEGG) - - def test_nog_registered_to_format(self): - self.assertSemanticTypeRegisteredToFormat( - FeatureData[NOG], - OrthologAnnotationDirFmt) - - def test_og_registered_to_format(self): - self.assertSemanticTypeRegisteredToFormat( - FeatureData[OG], - OrthologAnnotationDirFmt) - - def test_kegg_registered_to_format(self): - self.assertSemanticTypeRegisteredToFormat( - FeatureData[KEGG], - OrthologAnnotationDirFmt) - def test_bowtie_index_semantic_type_to_format_registration(self): self.assertSemanticTypeRegisteredToFormat( FeatureData[SingleBowtie2Index], diff --git a/q2_types/genome_data/__init__.py b/q2_types/genome_data/__init__.py index fd3d285d..2ce5002b 100644 --- a/q2_types/genome_data/__init__.py +++ b/q2_types/genome_data/__init__.py @@ -9,19 +9,21 @@ import importlib from ._format import ( - GenesDirectoryFormat, ProteinsDirectoryFormat, - GFF3Format, LociDirectoryFormat, OrthologFileFmt, SeedOrthologDirFmt, + GenesDirectoryFormat, ProteinsDirectoryFormat, LociDirectoryFormat, + GFF3Format, OrthologFileFmt, SeedOrthologDirFmt, + GenomeSequencesDirectoryFormat, OrthologAnnotationDirFmt, ) from ._transformer import IntervalMetadataIterator from ._type import ( - GenomeData, Genes, Proteins, Loci, Ortholog, BLAST6 + GenomeData, Genes, Proteins, Loci, Orthologs, DNASequence, NOG ) __all__ = [ 'GenomeData', 'Genes', 'Proteins', 'Loci', 'GFF3Format', 'GenesDirectoryFormat', 'ProteinsDirectoryFormat', 'LociDirectoryFormat', - 'IntervalMetadataIterator', 'OrthologFileFmt', 'Ortholog', - 'SeedOrthologDirFmt', 'BLAST6', + 'IntervalMetadataIterator', 'OrthologFileFmt', 'Orthologs', + 'SeedOrthologDirFmt', 'GenomeSequencesDirectoryFormat', 'DNASequence', + 'OrthologAnnotationDirFmt', 'NOG' ] importlib.import_module('q2_types.genome_data._format') diff --git a/q2_types/genome_data/_format.py b/q2_types/genome_data/_format.py index 06256d80..f5e9c8a6 100644 --- a/q2_types/genome_data/_format.py +++ b/q2_types/genome_data/_format.py @@ -5,6 +5,7 @@ # # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- +import re import qiime2.plugin.model as model from q2_types.feature_data import DNAFASTAFormat, ProteinFASTAFormat @@ -168,9 +169,12 @@ def loci_path_maker(self, genome_id): return '%s.gff' % genome_id -plugin.register_formats( - GenesDirectoryFormat, ProteinsDirectoryFormat, LociDirectoryFormat -) +class GenomeSequencesDirectoryFormat(model.DirectoryFormat): + genomes = model.FileCollection(r'.+\.(fasta|fa)$', format=DNAFASTAFormat) + + @genomes.set_path_maker + def genomes_path_maker(self, genome_id): + return '%s.fasta' % genome_id class SeedOrthologDirFmt(model.DirectoryFormat): @@ -183,4 +187,32 @@ def seed_ortholog_pathmaker(self, sample_name): return str(sample_name.split(sep=".")[0] + ".seed_orthologs") -plugin.register_formats(OrthologFileFmt, SeedOrthologDirFmt) +class OrthologAnnotationDirFmt(model.DirectoryFormat): + pathspec = r'.+\.annotations' + annotations = model.FileCollection(pathspec, format=OrthologFileFmt) + + @annotations.set_path_maker + def annotations_path_maker(self, file_name): + return file_name.split(sep="_")[0] + + def annotation_dict(self, relative=False) -> dict: + ids = {} + for path in self.path.iterdir(): + if re.compile(self.pathspec).match(path.name): + _id = re.sub('.emapper$', '', path.stem) + absolute_path = path.absolute() + if relative: + ids[_id] = str( + absolute_path.relative_to(self.path.absolute()) + ) + else: + ids[_id] = str(absolute_path) + + return dict(sorted(ids.items())) + + +plugin.register_formats( + GenesDirectoryFormat, ProteinsDirectoryFormat, LociDirectoryFormat, + GenomeSequencesDirectoryFormat, OrthologFileFmt, SeedOrthologDirFmt, + OrthologAnnotationDirFmt +) diff --git a/q2_types/genome_data/_transformer.py b/q2_types/genome_data/_transformer.py index 1d449ed9..5a3c2515 100644 --- a/q2_types/genome_data/_transformer.py +++ b/q2_types/genome_data/_transformer.py @@ -8,13 +8,17 @@ import collections.abc import os +import uuid +from io import StringIO import pandas as pd +import qiime2 import skbio from skbio.io import read from . import ( - GenesDirectoryFormat, ProteinsDirectoryFormat, GFF3Format, OrthologFileFmt + GenesDirectoryFormat, ProteinsDirectoryFormat, GFF3Format, + OrthologFileFmt, OrthologAnnotationDirFmt ) from ..plugin_setup import plugin @@ -25,20 +29,6 @@ } -@plugin.register_transformer -def _8(ortholog_file: OrthologFileFmt) -> pd.DataFrame: - - seed_ortholog_column_names = ['qseqid', 'sseqid', 'evalue', 'bitscore', - 'qstart', 'qend', 'sstart', 'send', 'pident', - 'qcov', 'scov'] - - return pd.read_csv(ortholog_file.path, sep="\t", - names=seed_ortholog_column_names, - header='infer', - comment="#" - ) - - def _series_to_fasta(series, ff, seq_type='DNA'): fp = os.path.join(ff.path, f'{series.name}.fasta') with open(fp, 'w') as fh: @@ -104,3 +94,98 @@ def _7(data: IntervalMetadataIterator) -> GFF3Format: for _id, im in data: im.write(fh, format='gff3', seq_id=_id) return ff + + +def _is_valid_uuid4(uuid_string: str): + """ + Check if a given string is a valid UUID version 4. + + This function checks if the provided string is a valid UUID version 4. + The only purpose of doing that here is to identify whether provided + string was a MAG ID (UUID4) or a sample ID. For that reason, we don't + print any statements or raise any exceptions. + + Parameters: + uuid_string (str): The string to check for UUID version 4 validity. + + Returns: + bool: True if the string is a valid UUID version 4, False otherwise. + """ + try: + uuid_obj = uuid.UUID(uuid_string, version=4) + except ValueError: + return False + + return str(uuid_obj) == uuid_string + + +def _reshuffle_columns(df: pd.DataFrame): + if 'MAG' in df.columns: + col = 'MAG' + elif 'Sample' in df.columns: + col = 'Sample' + else: + return df + + cols = df.columns.tolist() + cols.remove(col) + cols.insert(0, col) + df = df[cols] + return df + + +def _annotations_to_dataframe( + data: OrthologAnnotationDirFmt +) -> pd.DataFrame: + annotations = data.annotation_dict() + dfs = [] + for _id, path in annotations.items(): + # we need to ignore the ## comments at + # the beginning and end of the file + with open(path, 'r') as f: + lines = [line for line in f if not line.startswith('##')] + + df = pd.read_csv(StringIO('\n'.join(lines)), sep='\t', index_col=0) + if _is_valid_uuid4(_id): + df['MAG'] = _id + else: + df['Sample'] = _id + + dfs.append(df) + + df = pd.concat(dfs) + + # to satisfy QIIME2's particular requirements + df.reset_index(drop=False, inplace=True) + df.index = df.index.astype(str) + df.index.rename('id', inplace=True) + + # reshuffle columns for nicer display + df = _reshuffle_columns(df) + + return df + + +@plugin.register_transformer +def _8(ortholog_file: OrthologFileFmt) -> pd.DataFrame: + + seed_ortholog_column_names = ['qseqid', 'sseqid', 'evalue', 'bitscore', + 'qstart', 'qend', 'sstart', 'send', 'pident', + 'qcov', 'scov'] + + return pd.read_csv(ortholog_file.path, sep="\t", + names=seed_ortholog_column_names, + header='infer', + comment="#" + ) + + +@plugin.register_transformer +def _9(data: OrthologAnnotationDirFmt) -> pd.DataFrame: + return _annotations_to_dataframe(data) + + +@plugin.register_transformer +def _10(data: OrthologAnnotationDirFmt) -> qiime2.Metadata: + annotations = _annotations_to_dataframe(data) + return qiime2.Metadata(annotations) diff --git a/q2_types/genome_data/_type.py b/q2_types/genome_data/_type.py index ad60cfd5..2176d896 100644 --- a/q2_types/genome_data/_type.py +++ b/q2_types/genome_data/_type.py @@ -7,22 +7,29 @@ # ---------------------------------------------------------------------------- from qiime2.plugin import SemanticType -from q2_types.feature_data import BLAST6 from . import ( GenesDirectoryFormat, ProteinsDirectoryFormat, LociDirectoryFormat, - SeedOrthologDirFmt, + SeedOrthologDirFmt, GenomeSequencesDirectoryFormat, + OrthologAnnotationDirFmt, ) from ..plugin_setup import plugin +from ..sample_data import SampleData -GenomeData = SemanticType('GenomeData', field_names='type', - field_members={'type': BLAST6}) +GenomeData = SemanticType('GenomeData', field_names='type') Genes = SemanticType('Genes', variant_of=GenomeData.field['type']) Proteins = SemanticType('Proteins', variant_of=GenomeData.field['type']) Loci = SemanticType('Loci', variant_of=GenomeData.field['type']) -Ortholog = SemanticType('Ortholog', variant_of=GenomeData.field['type']) +Orthologs = SemanticType('Orthologs', + variant_of=[GenomeData.field['type'], + SampleData.field['type']]) +NOG = SemanticType('NOG', variant_of=[GenomeData.field['type'], + SampleData.field['type']]) +DNASequence = SemanticType('DNASequence', variant_of=GenomeData.field['type']) -plugin.register_semantic_types(GenomeData, Genes, Proteins, Loci) +plugin.register_semantic_types( + GenomeData, Genes, Proteins, Loci, DNASequence, Orthologs, NOG +) plugin.register_semantic_type_to_format( GenomeData[Genes], @@ -40,6 +47,26 @@ ) plugin.register_semantic_type_to_format( - GenomeData[BLAST6], + GenomeData[Orthologs], artifact_format=SeedOrthologDirFmt ) + +plugin.register_semantic_type_to_format( + GenomeData[DNASequence], + artifact_format=GenomeSequencesDirectoryFormat +) + +plugin.register_semantic_type_to_format( + GenomeData[NOG], + artifact_format=OrthologAnnotationDirFmt +) + +plugin.register_semantic_type_to_format( + SampleData[Orthologs], + artifact_format=SeedOrthologDirFmt +) + +plugin.register_artifact_class( + SampleData[NOG], + directory_format=OrthologAnnotationDirFmt +) diff --git a/q2_types/genome_data/tests/data/genome-sequences/sequence1.fa b/q2_types/genome_data/tests/data/genome-sequences/sequence1.fa new file mode 100644 index 00000000..ae11e361 --- /dev/null +++ b/q2_types/genome_data/tests/data/genome-sequences/sequence1.fa @@ -0,0 +1,5 @@ +>genome1 some_description1 +GGCAGATTCCCCCTAGACCCGCCCGCACCATGGTCAGGCATGCCCCTCCTCATCGCTGGGCACAGCCCAGAGGGT +ATAAACAGTGCTGGAGGCCCACTGCACTCACCGCACCCGGCCAATTTTTGTGTTTTTAGTAGAGACTAAATACCA +TATAGTGAACACCTAAGAGCACCCGGCCAATTTTTGTGTTTTTAGTAGAGAAGATTCCCCCTAGACCCGCCCGCT +ATAGTGAACACCTAAGAACTGGAGGCGGGGGGCCTTGG diff --git a/q2_types/genome_data/tests/data/genome-sequences/sequence2.fasta b/q2_types/genome_data/tests/data/genome-sequences/sequence2.fasta new file mode 100644 index 00000000..75e1d89f --- /dev/null +++ b/q2_types/genome_data/tests/data/genome-sequences/sequence2.fasta @@ -0,0 +1,4 @@ +>genome2 some_description2 +ATGGTCAGGCATGCCCCTCCTCATCGCTGGGCGGCAGATTCCCCCTAGACCCGCCCGCACCACAGCCCAGAGGGT +ATAAACAGTGCTGGAGGCAATACCATATAGTGAACACCTAACCACTGCACTCACCGCACCCGGCCAATTTTTGTG +TTTTTAGTAGAGACTAGACGGGGGGCCTTGG diff --git a/q2_types/feature_data_mag/tests/data/good_ortholog_annotation/test_output.emapper.annotations b/q2_types/genome_data/tests/data/ortholog-annotation-extra/test_output.emapper.annotations similarity index 100% rename from q2_types/feature_data_mag/tests/data/good_ortholog_annotation/test_output.emapper.annotations rename to q2_types/genome_data/tests/data/ortholog-annotation-extra/test_output.emapper.annotations diff --git a/q2_types/feature_data_mag/tests/data/ortholog_annotation_extra/test_output.emapper.orthologs b/q2_types/genome_data/tests/data/ortholog-annotation-extra/test_output.emapper.orthologs similarity index 100% rename from q2_types/feature_data_mag/tests/data/ortholog_annotation_extra/test_output.emapper.orthologs rename to q2_types/genome_data/tests/data/ortholog-annotation-extra/test_output.emapper.orthologs diff --git a/q2_types/feature_data_mag/tests/data/ortholog_annotation_mags/d33dbcbe-eedd-4a19-8b93-e76a15ce107f.emapper.annotations b/q2_types/genome_data/tests/data/ortholog-annotation-mags/d33dbcbe-eedd-4a19-8b93-e76a15ce107f.emapper.annotations similarity index 100% rename from q2_types/feature_data_mag/tests/data/ortholog_annotation_mags/d33dbcbe-eedd-4a19-8b93-e76a15ce107f.emapper.annotations rename to q2_types/genome_data/tests/data/ortholog-annotation-mags/d33dbcbe-eedd-4a19-8b93-e76a15ce107f.emapper.annotations diff --git a/q2_types/feature_data_mag/tests/data/ortholog_annotation_mags/e6408340-b308-4325-83a1-ea406f3e19ad.emapper.annotations b/q2_types/genome_data/tests/data/ortholog-annotation-mags/e6408340-b308-4325-83a1-ea406f3e19ad.emapper.annotations similarity index 100% rename from q2_types/feature_data_mag/tests/data/ortholog_annotation_mags/e6408340-b308-4325-83a1-ea406f3e19ad.emapper.annotations rename to q2_types/genome_data/tests/data/ortholog-annotation-mags/e6408340-b308-4325-83a1-ea406f3e19ad.emapper.annotations diff --git a/q2_types/feature_data_mag/tests/data/ortholog_annotation_samples/test_output1.emapper.annotations b/q2_types/genome_data/tests/data/ortholog-annotation-samples/test_output1.emapper.annotations similarity index 100% rename from q2_types/feature_data_mag/tests/data/ortholog_annotation_samples/test_output1.emapper.annotations rename to q2_types/genome_data/tests/data/ortholog-annotation-samples/test_output1.emapper.annotations diff --git a/q2_types/feature_data_mag/tests/data/ortholog_annotation_samples/test_output2.emapper.annotations b/q2_types/genome_data/tests/data/ortholog-annotation-samples/test_output2.emapper.annotations similarity index 100% rename from q2_types/feature_data_mag/tests/data/ortholog_annotation_samples/test_output2.emapper.annotations rename to q2_types/genome_data/tests/data/ortholog-annotation-samples/test_output2.emapper.annotations diff --git a/q2_types/feature_data_mag/tests/data/ortholog_annotation_extra/test_output.emapper.annotations b/q2_types/genome_data/tests/data/ortholog-annotation/test_output.emapper.annotations similarity index 100% rename from q2_types/feature_data_mag/tests/data/ortholog_annotation_extra/test_output.emapper.annotations rename to q2_types/genome_data/tests/data/ortholog-annotation/test_output.emapper.annotations diff --git a/q2_types/genome_data/tests/test_format.py b/q2_types/genome_data/tests/test_format.py index d8b639e4..d80abafe 100644 --- a/q2_types/genome_data/tests/test_format.py +++ b/q2_types/genome_data/tests/test_format.py @@ -14,6 +14,7 @@ from .._format import ( GenesDirectoryFormat, ProteinsDirectoryFormat, GFF3Format, LociDirectoryFormat, SeedOrthologDirFmt, OrthologFileFmt, + OrthologAnnotationDirFmt, GenomeSequencesDirectoryFormat, ) @@ -77,6 +78,12 @@ def test_loci_dirfmt_with_suffix(self): fmt.validate() + def test_genome_sequences_dirfmt(self): + dirpath = self.get_data_path('genome-sequences') + fmt = GenomeSequencesDirectoryFormat(dirpath, mode='r') + + fmt.validate() + def test_gff_format_wrong_version(self): filepath = self.get_data_path('loci-invalid/loci-wrong-version.gff') with self.assertRaisesRegex( @@ -139,6 +146,39 @@ def test_gff_format_invalid_phase(self): ValidationError, 'The phase on line 10 was 8.'): GFF3Format(filepath, mode='r').validate() + def test_ortholog_annotation_dir_fmt_passing(self): + dirpath = self.get_data_path('ortholog-annotation') + fmt_obj = OrthologAnnotationDirFmt(dirpath, mode='r') + fmt_obj.validate() + + def test_ortholog_annotation_dir_fmt_fails_extra_file(self): + dirpath = self.get_data_path('ortholog-annotation-extra') + fmt_obj = OrthologAnnotationDirFmt(dirpath, mode='r') + + with self.assertRaisesRegex(ValidationError, "Unrecognized file"): + fmt_obj.validate() + + def test_ortholog_annotations_annot_dict(self): + annotations = OrthologAnnotationDirFmt( + self.get_data_path('ortholog-annotation-samples'), mode='r' + ) + + obs = annotations.annotation_dict() + exp = { + 'test_output1': + str(annotations.path / 'test_output1.emapper.annotations'), + 'test_output2': + str(annotations.path / 'test_output2.emapper.annotations') + } + self.assertDictEqual(obs, exp) + + obs = annotations.annotation_dict(relative=True) + exp = { + 'test_output1': 'test_output1.emapper.annotations', + 'test_output2': 'test_output2.emapper.annotations' + } + self.assertDictEqual(obs, exp) + if __name__ == '__main__': unittest.main() diff --git a/q2_types/genome_data/tests/test_transformer.py b/q2_types/genome_data/tests/test_transformer.py index c5d7cb41..b6fcf981 100644 --- a/q2_types/genome_data/tests/test_transformer.py +++ b/q2_types/genome_data/tests/test_transformer.py @@ -9,13 +9,16 @@ import unittest import pandas as pd +import qiime2 import skbio.io from qiime2.plugin.testing import TestPluginBase from q2_types.genome_data import ( GenesDirectoryFormat, ProteinsDirectoryFormat, GFF3Format, - IntervalMetadataIterator + IntervalMetadataIterator, OrthologAnnotationDirFmt, NOG ) +from q2_types.genome_data._transformer import _annotations_to_dataframe +from q2_types.sample_data import SampleData class TestTransformers(TestPluginBase): @@ -120,6 +123,60 @@ def test_interval_metadata_iterator_to_gff(self): for o, e in zip(obs, input): self.assertEqual(o, e) + def test_annotations_to_dataframe_samples(self): + annotations = OrthologAnnotationDirFmt( + self.get_data_path('ortholog-annotation-samples'), + mode='r' + ) + obs = _annotations_to_dataframe(annotations) + self.assertEqual((11, 22), obs.shape) + self.assertTrue(obs.columns[0] == "Sample") + self.assertTrue(obs.index.is_unique) + self.assertEqual("id", obs.index.name) + + def test_annotations_to_dataframe_mags(self): + annotations = OrthologAnnotationDirFmt( + self.get_data_path('ortholog-annotation-mags'), + mode='r' + ) + obs = _annotations_to_dataframe(annotations) + self.assertEqual((11, 22), obs.shape) + self.assertTrue(obs.columns[0] == "MAG") + self.assertTrue(obs.index.is_unique) + self.assertEqual("id", obs.index.name) + + def test_annotations_to_df_transformer(self): + annotations = OrthologAnnotationDirFmt( + self.get_data_path('ortholog-annotation-mags'), + mode='r' + ) + transformer = self.get_transformer( + OrthologAnnotationDirFmt, pd.DataFrame + ) + + obs = transformer(annotations) + self.assertIsInstance(obs, pd.DataFrame) + self.assertEqual((11, 22), obs.shape) + self.assertTrue(obs.columns[0] == "MAG") + self.assertTrue(obs.index.is_unique) + self.assertEqual("id", obs.index.name) + + def test_annotations_to_metadata_transformer(self): + annotations = OrthologAnnotationDirFmt( + self.get_data_path('ortholog-annotation-mags'), + mode='r' + ) + transformer = self.get_transformer( + OrthologAnnotationDirFmt, qiime2.Metadata + ) + + obs = transformer(annotations) + self.assertIsInstance(obs, qiime2.Metadata) + + def test_nog_registered_to_format(self): + self.assertSemanticTypeRegisteredToFormat( + SampleData[NOG], OrthologAnnotationDirFmt) + if __name__ == '__main__': unittest.main() diff --git a/q2_types/genome_data/tests/test_type.py b/q2_types/genome_data/tests/test_type.py index 9d8d1b69..471b1a9d 100644 --- a/q2_types/genome_data/tests/test_type.py +++ b/q2_types/genome_data/tests/test_type.py @@ -13,18 +13,16 @@ from q2_types.genome_data import ( GenomeData, Genes, Proteins, Loci, GenesDirectoryFormat, - ProteinsDirectoryFormat, LociDirectoryFormat, SeedOrthologDirFmt, BLAST6 - ) + ProteinsDirectoryFormat, LociDirectoryFormat, SeedOrthologDirFmt, + Orthologs, GenomeSequencesDirectoryFormat, DNASequence, NOG, + OrthologAnnotationDirFmt +) +from q2_types.sample_data import SampleData class TestTypes(TestPluginBase): package = 'q2_types.genome_data.tests' - def test_blast6_registered_to_seedorthologdirfmt(self): - self.assertSemanticTypeRegisteredToFormat( - GenomeData[BLAST6], - SeedOrthologDirFmt) - def test_genome_data_semantic_type_registration(self): self.assertRegisteredSemanticType(GenomeData) @@ -37,6 +35,15 @@ def test_proteins_semantic_type_registration(self): def test_loci_semantic_type_registration(self): self.assertRegisteredSemanticType(Loci) + def test_sequence_semantic_type_registration(self): + self.assertRegisteredSemanticType(DNASequence) + + def test_orthologs_semantic_type_registration(self): + self.assertRegisteredSemanticType(Orthologs) + + def test_nog_semantic_type_registration(self): + self.assertRegisteredSemanticType(NOG) + def test_genome_data_genes_to_genes_dir_fmt_registration(self): self.assertSemanticTypeRegisteredToFormat( GenomeData[Genes], GenesDirectoryFormat) @@ -49,6 +56,26 @@ def test_genome_data_loci_to_loci_dir_fmt_registration(self): self.assertSemanticTypeRegisteredToFormat( GenomeData[Loci], LociDirectoryFormat) + def test_genome_data_nog_registered_to_seedorthologdirfmt(self): + self.assertSemanticTypeRegisteredToFormat( + GenomeData[NOG], OrthologAnnotationDirFmt) + + def test_genome_data_orthologs_registered_to_seedorthologdirfmt(self): + self.assertSemanticTypeRegisteredToFormat( + GenomeData[Orthologs], SeedOrthologDirFmt) + + def test_genome_data_sequence_to_genome_dir_fmt_registration(self): + self.assertSemanticTypeRegisteredToFormat( + GenomeData[DNASequence], GenomeSequencesDirectoryFormat) + + def test_sample_data_orthologs_registered_to_seedorthologdirfmt(self): + self.assertSemanticTypeRegisteredToFormat( + SampleData[Orthologs], SeedOrthologDirFmt) + + def test_sample_data_nog_registered_to_seedorthologdirfmt(self): + self.assertSemanticTypeRegisteredToFormat( + SampleData[NOG], OrthologAnnotationDirFmt) + if __name__ == '__main__': unittest.main() diff --git a/q2_types/per_sample_sequences/_type.py b/q2_types/per_sample_sequences/_type.py index 316f97bb..e671702c 100644 --- a/q2_types/per_sample_sequences/_type.py +++ b/q2_types/per_sample_sequences/_type.py @@ -7,10 +7,9 @@ # ---------------------------------------------------------------------------- from q2_types.bowtie2 import Bowtie2IndexDirFmt -from q2_types.feature_data import BLAST6, FeatureData +from q2_types.feature_data import FeatureData from qiime2.plugin import SemanticType -from ..genome_data import SeedOrthologDirFmt from ..plugin_setup import plugin from ..sample_data import SampleData from . import (QIIME1DemuxDirFmt, SingleLanePerSampleSingleEndFastqDirFmt, @@ -102,7 +101,3 @@ SampleData[MultiAlignmentMap], artifact_format=MultiBAMDirFmt ) -plugin.register_semantic_type_to_format( - SampleData[BLAST6], - artifact_format=SeedOrthologDirFmt -) diff --git a/q2_types/per_sample_sequences/tests/test_type.py b/q2_types/per_sample_sequences/tests/test_type.py index 407bfb76..dc97eaf5 100644 --- a/q2_types/per_sample_sequences/tests/test_type.py +++ b/q2_types/per_sample_sequences/tests/test_type.py @@ -9,7 +9,7 @@ import unittest from q2_types.bowtie2 import Bowtie2IndexDirFmt -from q2_types.feature_data import BLAST6, FeatureData +from q2_types.feature_data import FeatureData from q2_types.sample_data import SampleData from q2_types.per_sample_sequences import ( Sequences, SequencesWithQuality, PairedEndSequencesWithQuality, @@ -23,7 +23,6 @@ ) from q2_types.per_sample_sequences._type import (AlignmentMap, MultiAlignmentMap) -from q2_types.genome_data import SeedOrthologDirFmt from qiime2.plugin.testing import TestPluginBase @@ -131,12 +130,6 @@ def test_multi_aln_map_semantic_type_to_format_registration(self): MultiBAMDirFmt ) - def test_sdb6_semantic_type_to_format_registration(self): - self.assertSemanticTypeRegisteredToFormat( - SampleData[BLAST6], - SeedOrthologDirFmt - ) - if __name__ == '__main__': unittest.main() diff --git a/setup.py b/setup.py index 94be4dd0..43bc0e99 100644 --- a/setup.py +++ b/setup.py @@ -65,11 +65,16 @@ ['data/*', 'data/*/*', 'data/mags-fa/*', 'data/mags-fasta/*'], 'q2_types.genome_data.tests': - ['data/*/', + ['data/*', 'data/genes/*', 'data/loci-invalid/*', 'data/loci/*', + 'data/genome-sequences/*', 'data/ortholog/*', + 'data/ortholog-annotation-extra/*', + 'data/ortholog-annotation-mags/*', + 'data/ortholog-annotation-samples/*', + 'data/ortholog-annotation/*', 'data/proteins/*', ], 'q2_types.kraken2.tests': [