Skip to content

Commit

Permalink
MAINT: refactor/clean up GenomeData types (#338)
Browse files Browse the repository at this point in the history
  • Loading branch information
misialq authored Jul 17, 2024
1 parent a22a64a commit 04dd0da
Show file tree
Hide file tree
Showing 27 changed files with 336 additions and 328 deletions.
4 changes: 1 addition & 3 deletions q2_types/feature_data/_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
RNASequencesDirectoryFormat, AlignedRNASequencesDirectoryFormat,
PairedRNASequencesDirectoryFormat, BLAST6DirectoryFormat,
SequenceCharacteristicsDirectoryFormat)
from q2_types.sample_data import SampleData


FeatureData = SemanticType('FeatureData', field_names='type')
Expand Down Expand Up @@ -52,8 +51,7 @@
variant_of=FeatureData.field['type'])

BLAST6 = SemanticType('BLAST6',
variant_of=[FeatureData.field['type'],
SampleData.field['type']])
variant_of=FeatureData.field['type'])

SequenceCharacteristics = SemanticType('SequenceCharacteristics',
variant_of=FeatureData.field['type'])
Expand Down
13 changes: 3 additions & 10 deletions q2_types/feature_data_mag/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,12 @@

import importlib

from ._format import (
MAGSequencesDirFmt,
OrthologAnnotationDirFmt,
OrthologFileFmt
)
from ._format import MAGSequencesDirFmt

from ._type import MAG, NOG, OG, KEGG, Contig
from ._type import MAG, Contig
from ._transformer import MAGIterator

__all__ = [
'MAG', 'MAGSequencesDirFmt', 'MAGIterator', 'NOG', 'OG', 'KEGG',
'OrthologAnnotationDirFmt', 'OrthologFileFmt', 'Contig'
]
__all__ = ['MAG', 'MAGSequencesDirFmt', 'MAGIterator', 'Contig']

importlib.import_module('q2_types.feature_data_mag._format')
importlib.import_module('q2_types.feature_data_mag._transformer')
Expand Down
28 changes: 0 additions & 28 deletions q2_types/feature_data_mag/_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import re

from q2_types.feature_data import DNAFASTAFormat
from q2_types.genome_data._format import OrthologFileFmt
from qiime2.plugin import model

from ..plugin_setup import plugin
Expand Down Expand Up @@ -62,30 +61,3 @@ def feature_dict(self, relative=False):


plugin.register_formats(MAGSequencesDirFmt)


class OrthologAnnotationDirFmt(model.DirectoryFormat):
    """Directory format holding a collection of annotation files.

    Files whose names match ``pathspec`` belong to the ``annotations``
    collection, which is declared with ``OrthologFileFmt`` as its format.
    """
    pathspec = r'.+\.annotations'
    annotations = model.FileCollection(pathspec, format=OrthologFileFmt)

    @annotations.set_path_maker
    def annotations_path_maker(self, file_name):
        # Only the part of the name before the first underscore is kept.
        return file_name.split(sep="_")[0]

    def annotation_dict(self, relative=False) -> dict:
        """Map annotation IDs to the paths of their annotation files.

        The ID of a file is its stem with a trailing ``.emapper`` removed.

        Parameters
        ----------
        relative : bool
            If True, paths are given relative to this directory; otherwise
            they are absolute.

        Returns
        -------
        dict
            ``{id: path as str}``, sorted by ID.
        """
        # Compile once instead of on every iteration.
        pattern = re.compile(self.pathspec)
        root = self.path.absolute()

        ids = {}
        for path in self.path.iterdir():
            if not pattern.match(path.name):
                continue
            # The dot is escaped so that only a literal ".emapper" suffix is
            # stripped (an unescaped "." would match any character).
            _id = re.sub(r'\.emapper$', '', path.stem)
            absolute_path = path.absolute()
            if relative:
                ids[_id] = str(absolute_path.relative_to(root))
            else:
                ids[_id] = str(absolute_path)

        return dict(sorted(ids.items()))


plugin.register_formats(OrthologAnnotationDirFmt)
88 changes: 1 addition & 87 deletions q2_types/feature_data_mag/_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,22 +5,17 @@
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
from io import StringIO

import uuid

import collections.abc
import glob
import os.path

import qiime2
from itertools import repeat

import pandas as pd
import skbio
from q2_types.feature_data._transformer import _fastaformats_to_series

from . import MAGSequencesDirFmt, OrthologAnnotationDirFmt
from . import MAGSequencesDirFmt
from ..plugin_setup import plugin

CONSTRUCTORS = {
Expand Down Expand Up @@ -94,84 +89,3 @@ def _5(data: MAGIterator) -> MAGSequencesDirFmt:
with open(fp, 'a') as fin:
skbio.io.write(seq, format='fasta', into=fin)
return result


def _is_valid_uuid4(uuid_string: str) -> bool:
    """
    Check if a given string is a valid UUID version 4.

    The only purpose of doing that here is to identify whether the provided
    string is a MAG ID (UUID4) or a sample ID. For that reason nothing is
    printed and no exception is raised: anything that is not a canonical
    UUID4 string, including non-string input, yields False.

    Parameters:
        uuid_string (str): The string to check for UUID version 4 validity.
    Returns:
        bool: True if the string is a valid UUID version 4, False otherwise.
    """
    # uuid.UUID() raises TypeError/AttributeError (not ValueError) for
    # non-string input, so rule that out up front.
    if not isinstance(uuid_string, str):
        return False

    try:
        uuid_obj = uuid.UUID(uuid_string, version=4)
    except ValueError:
        return False

    # With version=4 the constructor forces the version/variant bits, so the
    # round trip reproduces the input only if it already was a lower-case,
    # hyphenated UUID4.
    return str(uuid_obj) == uuid_string


def _reshuffle_columns(df: pd.DataFrame):
    """Move the ID column to the front of the frame.

    'MAG' takes precedence over 'Sample'. When neither column is present
    the input frame is returned as is.
    """
    leading = next(
        (name for name in ('MAG', 'Sample') if name in df.columns), None
    )
    if leading is None:
        return df

    remaining = df.columns.tolist()
    remaining.remove(leading)
    return df[[leading] + remaining]


def _annotations_to_dataframe(
        data: OrthologAnnotationDirFmt
) -> pd.DataFrame:
    """Concatenate all annotation tables of a directory into one DataFrame.

    Every file listed by ``data.annotation_dict()`` is read as a
    tab-separated table after dropping the lines that start with ``##``.
    The file's ID is stored in a 'MAG' column when it is a UUID4 and in a
    'Sample' column otherwise.

    Returns
    -------
    pd.DataFrame
        The concatenated tables with a string index named 'id'; the index
        column read from the files is kept as a regular column.
    """
    annotations = data.annotation_dict()
    dfs = []
    for _id, path in annotations.items():
        # we need to ignore the ## comments at
        # the beginning and end of the file
        with open(path, 'r') as f:
            lines = [line for line in f if not line.startswith('##')]

        # The lines still carry their own terminators, so they are joined
        # without a separator; joining on '\n' would double every newline
        # and rely on the parser skipping the resulting blank lines.
        df = pd.read_csv(StringIO(''.join(lines)), sep='\t', index_col=0)
        if _is_valid_uuid4(_id):
            df['MAG'] = _id
        else:
            df['Sample'] = _id

        dfs.append(df)

    df = pd.concat(dfs)

    # to satisfy QIIME2's particular requirements
    df.reset_index(drop=False, inplace=True)
    df.index = df.index.astype(str)
    df.index.rename('id', inplace=True)

    # reshuffle columns for nicer display
    df = _reshuffle_columns(df)

    return df


@plugin.register_transformer
def _7(data: OrthologAnnotationDirFmt) -> pd.DataFrame:
    """Transform an annotation directory into a single DataFrame."""
    annotations = _annotations_to_dataframe(data)
    return annotations


@plugin.register_transformer
def _8(data: OrthologAnnotationDirFmt) -> qiime2.Metadata:
    """Transform an annotation directory into a qiime2.Metadata object."""
    return qiime2.Metadata(_annotations_to_dataframe(data))
27 changes: 1 addition & 26 deletions q2_types/feature_data_mag/_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,7 @@

from q2_types.feature_data import FeatureData

from q2_types.feature_data_mag._format import (
MAGSequencesDirFmt, OrthologAnnotationDirFmt
)
from q2_types.feature_data_mag._format import MAGSequencesDirFmt
from qiime2.core.type import SemanticType

from ..bowtie2 import Bowtie2IndexDirFmt
Expand All @@ -34,29 +32,6 @@
artifact_format=ContigSequencesDirFmt
)

NOG = SemanticType('NOG', variant_of=FeatureData.field['type'])

plugin.register_semantic_types(NOG)
plugin.register_artifact_class(
FeatureData[NOG],
directory_format=OrthologAnnotationDirFmt)


OG = SemanticType('OG', variant_of=FeatureData.field['type'])

plugin.register_semantic_types(OG)
plugin.register_artifact_class(
FeatureData[OG],
directory_format=OrthologAnnotationDirFmt)


KEGG = SemanticType('KEGG', variant_of=FeatureData.field['type'])

plugin.register_semantic_types(KEGG)
plugin.register_artifact_class(
FeatureData[KEGG],
directory_format=OrthologAnnotationDirFmt)

plugin.register_semantic_type_to_format(
FeatureData[SingleBowtie2Index],
artifact_format=Bowtie2IndexDirFmt
Expand Down
38 changes: 1 addition & 37 deletions q2_types/feature_data_mag/tests/test_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,8 @@
import unittest

from qiime2.plugin.testing import TestPluginBase
from qiime2.plugin import ValidationError

from q2_types.feature_data_mag._format import (
MAGSequencesDirFmt, OrthologAnnotationDirFmt,
)
from q2_types.feature_data_mag._format import MAGSequencesDirFmt


class TestFormats(TestPluginBase):
Expand Down Expand Up @@ -60,39 +57,6 @@ def test_mag_dirfmt_feature_dict(self):
}
self.assertDictEqual(obs, exp)

def test_ortholog_annotation_dir_fmt_passing(self):
    """The 'good_ortholog_annotation' fixture validates without error."""
    good_dir = self.get_data_path('good_ortholog_annotation')
    OrthologAnnotationDirFmt(good_dir, mode='r').validate()

def test_ortholog_annotation_dir_fmt_fails_extra_file(self):
    """Validation of the 'ortholog_annotation_extra' fixture must fail."""
    fmt_obj = OrthologAnnotationDirFmt(
        self.get_data_path('ortholog_annotation_extra'), mode='r'
    )

    expected_error = self.assertRaisesRegex(
        ValidationError, "Unrecognized file"
    )
    with expected_error:
        fmt_obj.validate()

def test_ortholog_annotations_annot_dict(self):
    """annotation_dict() yields absolute or relative paths keyed by ID."""
    annotations = OrthologAnnotationDirFmt(
        self.get_data_path('ortholog_annotation_samples'), mode='r'
    )
    exp_relative = {
        'test_output1': 'test_output1.emapper.annotations',
        'test_output2': 'test_output2.emapper.annotations'
    }
    # Absolute expectations are the same file names under the format's path.
    exp_absolute = {
        _id: str(annotations.path / file_name)
        for _id, file_name in exp_relative.items()
    }

    self.assertDictEqual(annotations.annotation_dict(), exp_absolute)
    self.assertDictEqual(
        annotations.annotation_dict(relative=True), exp_relative
    )


if __name__ == '__main__':
unittest.main()
57 changes: 2 additions & 55 deletions q2_types/feature_data_mag/tests/test_transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
# ----------------------------------------------------------------------------

import glob
import qiime2
import unittest
from itertools import repeat

Expand All @@ -17,11 +16,9 @@
from skbio import DNA

from q2_types.feature_data_mag import (
MAGSequencesDirFmt, MAGIterator, OrthologAnnotationDirFmt
)
from q2_types.feature_data_mag._transformer import (
_get_filename, _annotations_to_dataframe
MAGSequencesDirFmt, MAGIterator
)
from q2_types.feature_data_mag._transformer import _get_filename


class TestTransformers(TestPluginBase):
Expand Down Expand Up @@ -132,56 +129,6 @@ def test_mag_iterator_to_mag_sequences_dir_fmt(self):
obs_seqs = self.read_seqs_into_dict(str(obs))
self.assertDictEqual(self.mags_fa, obs_seqs)

def test_annotations_to_dataframe_samples(self):
    """The samples fixture yields an (11, 22) frame led by 'Sample'."""
    annotations = OrthologAnnotationDirFmt(
        self.get_data_path('ortholog_annotation_samples'),
        mode='r'
    )
    obs = _annotations_to_dataframe(annotations)
    self.assertEqual((11, 22), obs.shape)
    # assertEqual reports both values on failure, unlike assertTrue(a == b)
    self.assertEqual("Sample", obs.columns[0])
    self.assertTrue(obs.index.is_unique)
    self.assertEqual("id", obs.index.name)

def test_annotations_to_dataframe_mags(self):
    """The MAGs fixture yields an (11, 22) frame led by 'MAG'."""
    annotations = OrthologAnnotationDirFmt(
        self.get_data_path('ortholog_annotation_mags'),
        mode='r'
    )
    obs = _annotations_to_dataframe(annotations)
    self.assertEqual((11, 22), obs.shape)
    # assertEqual reports both values on failure, unlike assertTrue(a == b)
    self.assertEqual("MAG", obs.columns[0])
    self.assertTrue(obs.index.is_unique)
    self.assertEqual("id", obs.index.name)

def test_annotations_to_df_transformer(self):
    """The registered DataFrame transformer returns the expected frame."""
    annotations = OrthologAnnotationDirFmt(
        self.get_data_path('ortholog_annotation_mags'),
        mode='r'
    )
    transformer = self.get_transformer(
        OrthologAnnotationDirFmt, pd.DataFrame
    )

    obs = transformer(annotations)
    self.assertIsInstance(obs, pd.DataFrame)
    self.assertEqual((11, 22), obs.shape)
    # assertEqual reports both values on failure, unlike assertTrue(a == b)
    self.assertEqual("MAG", obs.columns[0])
    self.assertTrue(obs.index.is_unique)
    self.assertEqual("id", obs.index.name)

def test_annotations_to_metadata_transformer(self):
    """The registered Metadata transformer returns a qiime2.Metadata."""
    to_metadata = self.get_transformer(
        OrthologAnnotationDirFmt, qiime2.Metadata
    )
    mags_dir = self.get_data_path('ortholog_annotation_mags')
    annotations = OrthologAnnotationDirFmt(mags_dir, mode='r')

    self.assertIsInstance(to_metadata(annotations), qiime2.Metadata)


if __name__ == '__main__':
unittest.main()
Loading

0 comments on commit 04dd0da

Please sign in to comment.