Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Added new type FeatureData[SequenceCharacteristics] with semantic validator for property 'length'. #326

Merged
8 changes: 6 additions & 2 deletions q2_types/feature_data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,13 @@
MixedCaseAlignedDNAFASTAFormat,
MixedCaseAlignedDNASequencesDirectoryFormat,
MixedCaseAlignedRNAFASTAFormat,
MixedCaseAlignedRNASequencesDirectoryFormat)
MixedCaseAlignedRNASequencesDirectoryFormat,
SequenceCharacteristicsDirectoryFormat,
SequenceCharacteristicsFormat)
from ._type import (
FeatureData, Taxonomy, Sequence, PairedEndSequence, AlignedSequence,
Differential, ProteinSequence, AlignedProteinSequence, RNASequence,
AlignedRNASequence, PairedEndRNASequence, BLAST6)
AlignedRNASequence, PairedEndRNASequence, BLAST6, SequenceCharacteristics)

# TODO remove these imports when tests are rewritten. Remove from __all__ too
from ._transformer import (
Expand Down Expand Up @@ -67,6 +69,8 @@
'MixedCaseAlignedProteinFASTAFormat',
'MixedCaseProteinSequencesDirectoryFormat',
'MixedCaseAlignedProteinSequencesDirectoryFormat',
'SequenceCharacteristics', 'SequenceCharacteristicsDirectoryFormat',
'SequenceCharacteristicsFormat'
]

importlib.import_module('q2_types.feature_data._transformer')
33 changes: 32 additions & 1 deletion q2_types/feature_data/_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -470,6 +470,36 @@ def validate(self, *args):
'BLAST6DirectoryFormat', 'blast6.tsv', BLAST6Format)


class SequenceCharacteristicsFormat(model.TextFileFormat):
    """
    Format for a TSV file with information about sequences, such as the
    length of a feature. The first column contains feature identifiers and is
    followed by additional columns.

    The file cannot be empty and must have at least two columns.

    Validation for additional columns can be added with a semantic validator
    tied to a property. For example the
    "validate_sequence_characteristics_length" validator for
    "FeatureData[SequenceCharacteristics % Properties("length")]"
    adds validation for a numerical column called "length".
    """

    def validate(self, n_records=None):
        """
        Check that the file can be read as a TSV and has at least one column
        in addition to the identifier column.

        The ``n_records`` argument is not used by this method.

        Raises a ValidationError if the file is empty or if it only contains
        the identifier column.
        """
        try:
            data = pd.read_csv(str(self), sep="\t", index_col=0)
        except pd.errors.EmptyDataError:
            raise ValidationError('File cannot be empty.')

        # The first column was consumed as the index, so an empty set of
        # remaining columns means the file had a single column. Test for
        # emptiness directly rather than for the truthiness of the labels.
        if data.columns.empty:
            raise ValidationError('File needs to have at least two columns.')


# Single-file directory format: one "sequence_characteristics.txt" file in
# SequenceCharacteristicsFormat.
SequenceCharacteristicsDirectoryFormat = model.SingleFileDirectoryFormat(
    "SequenceCharacteristicsDirectoryFormat",
    "sequence_characteristics.txt",
    SequenceCharacteristicsFormat,
)

plugin.register_formats(
TSVTaxonomyFormat, TSVTaxonomyDirectoryFormat,
HeaderlessTSVTaxonomyFormat, HeaderlessTSVTaxonomyDirectoryFormat,
Expand All @@ -489,5 +519,6 @@ def validate(self, *args):
MixedCaseRNASequencesDirectoryFormat, MixedCaseAlignedDNAFASTAFormat,
MixedCaseAlignedDNASequencesDirectoryFormat,
MixedCaseAlignedRNAFASTAFormat,
MixedCaseAlignedRNASequencesDirectoryFormat
MixedCaseAlignedRNASequencesDirectoryFormat, SequenceCharacteristicsFormat,
SequenceCharacteristicsDirectoryFormat
)
15 changes: 14 additions & 1 deletion q2_types/feature_data/_transformer.py
VinzentRisch marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@
AlignedProteinFASTAFormat, RNAFASTAFormat,
AlignedRNAFASTAFormat, PairedRNASequencesDirectoryFormat,
BLAST6Format, MixedCaseDNAFASTAFormat, MixedCaseRNAFASTAFormat,
MixedCaseAlignedDNAFASTAFormat, MixedCaseAlignedRNAFASTAFormat)
MixedCaseAlignedDNAFASTAFormat, MixedCaseAlignedRNAFASTAFormat,
SequenceCharacteristicsFormat)


# Taxonomy format transformers
Expand Down Expand Up @@ -797,3 +798,15 @@ def _227(ff: BLAST6Format) -> qiime2.Metadata:
# default int index but cast to a str and give it a name.
data.index = pd.Index(data.index.astype(str), name='id')
return qiime2.Metadata(data)


@plugin.register_transformer
def _228(ff: SequenceCharacteristicsFormat) -> pd.DataFrame:
    """
    Read the tab-separated file behind ``ff`` into a DataFrame, using the
    first column as the index.
    """
    tsv_path = str(ff)
    frame = pd.read_csv(tsv_path, sep="\t", index_col=0)
    return frame
VinzentRisch marked this conversation as resolved.
Show resolved Hide resolved


@plugin.register_transformer
def _229(data: pd.DataFrame) -> SequenceCharacteristicsFormat:
    """
    Write ``data`` (index included) as a tab-separated file into a new
    SequenceCharacteristicsFormat instance and return that instance.
    """
    result = SequenceCharacteristicsFormat()
    tsv_path = str(result)
    data.to_csv(tsv_path, sep='\t')
    return result
43 changes: 39 additions & 4 deletions q2_types/feature_data/_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

import pandas as pd
from qiime2.core.exceptions import ValidationError
from qiime2.core.type import Properties
from qiime2.plugin import SemanticType

from ..plugin_setup import plugin
Expand All @@ -15,7 +17,8 @@
DifferentialDirectoryFormat, ProteinSequencesDirectoryFormat,
AlignedProteinSequencesDirectoryFormat,
RNASequencesDirectoryFormat, AlignedRNASequencesDirectoryFormat,
PairedRNASequencesDirectoryFormat, BLAST6DirectoryFormat)
PairedRNASequencesDirectoryFormat, BLAST6DirectoryFormat,
SequenceCharacteristicsDirectoryFormat)
from q2_types.sample_data import SampleData


Expand Down Expand Up @@ -52,13 +55,40 @@
variant_of=[FeatureData.field['type'],
SampleData.field['type']])

# Semantic type for characteristics of sequences; declared as a variant of
# the 'type' field of FeatureData.
SequenceCharacteristics = SemanticType('SequenceCharacteristics',
                                       variant_of=FeatureData.field['type'])


@plugin.register_validator(FeatureData[SequenceCharacteristics %
                                       Properties("length")])
def validate_sequence_characteristics_length(data: pd.DataFrame, level):
    """
    Semantic validator for the FeatureData[SequenceCharacteristics] type with
    property "length".

    Validates a numerical column called 'length', which cannot contain
    empty, negative or zero values. The ``level`` argument is not used.

    Raises a ValidationError if the column is missing, contains empty (NaN)
    values, is not numerical, or contains values that are not strictly
    positive.
    """
    if 'length' not in data.columns:
        raise ValidationError("Column 'length' has to exist in the file.")

    lengths = data['length']

    if lengths.isnull().any():
        raise ValidationError("Column 'length' cannot contain empty (NaN) "
                              "values.")

    # Must come before the comparisons below, which need a numeric column.
    if not pd.api.types.is_numeric_dtype(lengths):
        raise ValidationError("Values in column 'length' have to be "
                              "numerical.")

    if (lengths < 0).any():
        raise ValidationError("Column 'length' cannot contain negative "
                              "values.")

    # Zero is rejected as well; report it with its own message so the error
    # does not wrongly claim that a negative value was found.
    if (lengths == 0).any():
        raise ValidationError("Column 'length' cannot contain zero values.")


plugin.register_semantic_types(FeatureData, Taxonomy, Sequence,
PairedEndSequence, AlignedSequence,
Differential, ProteinSequence,
AlignedProteinSequence, RNASequence,
AlignedRNASequence, PairedEndRNASequence,
BLAST6)

BLAST6, SequenceCharacteristics)

plugin.register_artifact_class(
FeatureData[Taxonomy],
Expand Down Expand Up @@ -120,3 +150,8 @@
directory_format=BLAST6DirectoryFormat,
description=("BLAST results associated with a set of feature "
"identifiers."))
# Register FeatureData[SequenceCharacteristics] as an artifact class backed
# by SequenceCharacteristicsDirectoryFormat.
plugin.register_artifact_class(
    FeatureData[SequenceCharacteristics],
    directory_format=SequenceCharacteristicsDirectoryFormat,
    description=("Characteristics of sequences (e.g., the length of a gene "
                 "in basepairs)."))
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
id length
1 876
2 54
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
id
1
2
36 changes: 35 additions & 1 deletion q2_types/feature_data/tests/test_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,9 @@
MixedCaseRNAFASTAFormat, MixedCaseRNASequencesDirectoryFormat,
MixedCaseAlignedDNAFASTAFormat,
MixedCaseAlignedDNASequencesDirectoryFormat,
MixedCaseAlignedRNAFASTAFormat, MixedCaseAlignedRNASequencesDirectoryFormat
MixedCaseAlignedRNAFASTAFormat,
MixedCaseAlignedRNASequencesDirectoryFormat,
SequenceCharacteristicsDirectoryFormat, SequenceCharacteristicsFormat
)
from qiime2.plugin.testing import TestPluginBase
from qiime2.plugin import ValidationError
Expand Down Expand Up @@ -901,5 +903,37 @@ def test_blast6_format_invalid(self):
BLAST6DirectoryFormat(temp_dir, mode='r').validate()


class TestSequenceCharacteristicsFormat(TestPluginBase):
    # Tests for SequenceCharacteristicsFormat and its directory format.
    package = 'q2_types.feature_data.tests'

    def test_sequence_characteristics_directory_format(self):
        # A valid file copied under the expected name must validate.
        filepath = self.get_data_path('sequence_characteristics_length.txt')
        temp_dir = self.temp_dir.name
        shutil.copy(filepath, os.path.join(temp_dir,
                                           'sequence_characteristics.txt'))
        dir_fmt = SequenceCharacteristicsDirectoryFormat(temp_dir, mode='r')
        dir_fmt.validate()

    def test_sequence_characteristics_format(self):
        # A file with an id column and a 'length' column must validate.
        filepath = self.get_data_path('sequence_characteristics_length.txt')
        fmt = SequenceCharacteristicsFormat(filepath, mode='r')
        fmt.validate()

    def test_sequence_characteristics_format_empty(self):
        # An empty file must be rejected with a dedicated message.
        path = self.get_data_path('empty.txt')
        fmt = SequenceCharacteristicsFormat(path, mode='r')
        with self.assertRaises(ValidationError) as context:
            fmt.validate()
        # Checked after the with-block so the assertion is actually executed.
        self.assertEqual(str(context.exception), 'File cannot be empty.')

    def test_sequence_characteristics_format_only_index(self):
        # A file that only has the id column must be rejected.
        path = self.get_data_path('sequence_characteristics_only_index.txt')
        fmt = SequenceCharacteristicsFormat(path, mode='r')
        with self.assertRaises(ValidationError) as context:
            fmt.validate()
        self.assertEqual(str(context.exception),
                         'File needs to have at least two columns.')


if __name__ == '__main__':
unittest.main()
31 changes: 29 additions & 2 deletions q2_types/feature_data/tests/test_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

import filecmp
import os.path
import unittest

Expand All @@ -25,7 +25,7 @@
AlignedProteinFASTAFormat, RNAFASTAFormat, AlignedRNAFASTAFormat,
RNAIterator, AlignedRNAIterator, BLAST6Format, MixedCaseDNAFASTAFormat,
MixedCaseRNAFASTAFormat, MixedCaseAlignedDNAFASTAFormat,
MixedCaseAlignedRNAFASTAFormat
MixedCaseAlignedRNAFASTAFormat, SequenceCharacteristicsFormat
)
from q2_types.feature_data._transformer import (
_taxonomy_formats_to_dataframe, _dataframe_to_tsv_taxonomy_format,
Expand Down Expand Up @@ -1507,5 +1507,32 @@ def test_blast6_to_metadata(self):
assert_frame_equal(obs.to_dataframe(), exp)


class TestSequenceCharacteristicsTransformer(TestPluginBase):
    # Round-trip tests between SequenceCharacteristicsFormat and DataFrame.
    package = 'q2_types.feature_data.tests'

    def setUp(self):
        super().setUp()
        # Reference file and the DataFrame with the same contents.
        self.exp_file = self.get_data_path(
            "sequence_characteristics_length.txt")
        self.exp_df = pd.DataFrame({'length': [876, 54]},
                                   index=pd.Index([1, 2], name='id'))

    def test_df_to_sequence_characteristics_format(self):
        transformer = self.get_transformer(pd.DataFrame,
                                           SequenceCharacteristicsFormat)
        obs = transformer(self.exp_df)

        self.assertIsInstance(obs, SequenceCharacteristicsFormat)
        # Use a unittest assertion (a bare assert is stripped under -O) and
        # force a byte-by-byte comparison instead of relying on os.stat().
        self.assertTrue(filecmp.cmp(self.exp_file, obs.path, shallow=False))

    def test_sequence_characteristics_format_to_df(self):
        transformer = self.get_transformer(SequenceCharacteristicsFormat,
                                           pd.DataFrame)
        fmt = SequenceCharacteristicsFormat(self.exp_file, mode="r")
        obs = transformer(fmt)

        assert_frame_equal(self.exp_df, obs)


if __name__ == '__main__':
unittest.main()
54 changes: 53 additions & 1 deletion q2_types/feature_data/tests/test_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@

import unittest

import pandas as pd
from qiime2.core.exceptions import ValidationError

from q2_types.feature_data import (
FeatureData, Taxonomy, Sequence, PairedEndSequence, AlignedSequence,
Differential, TSVTaxonomyDirectoryFormat, DNASequencesDirectoryFormat,
Expand All @@ -17,10 +20,14 @@
AlignedProteinSequence, RNASequence, RNASequencesDirectoryFormat,
AlignedRNASequencesDirectoryFormat, AlignedRNASequence,
PairedRNASequencesDirectoryFormat, PairedEndRNASequence,
BLAST6, BLAST6DirectoryFormat
BLAST6, BLAST6DirectoryFormat, SequenceCharacteristics,
SequenceCharacteristicsDirectoryFormat
)
from qiime2.plugin.testing import TestPluginBase

from q2_types.feature_data._type import \
validate_sequence_characteristics_length


class TestTypes(TestPluginBase):
package = 'q2_types.feature_data.tests'
Expand Down Expand Up @@ -118,6 +125,51 @@ def test_blast6_semantic_type_to_format_registration(self):
self.assertSemanticTypeRegisteredToFormat(
FeatureData[BLAST6], BLAST6DirectoryFormat)

def test_sequence_characteristics_semantic_type_registration(self):
    # The semantic type has to be registered with the plugin.
    semantic_type = SequenceCharacteristics
    self.assertRegisteredSemanticType(semantic_type)

def test_sequence_characteristics_semantic_type_format_registration(self):
    # The artifact class has to be registered to its directory format.
    artifact_type = FeatureData[SequenceCharacteristics]
    self.assertSemanticTypeRegisteredToFormat(
        artifact_type, SequenceCharacteristicsDirectoryFormat)

def test_validate_sequence_characteristics_length(self):
    # The unmodified reference table has to pass without raising.
    validate_sequence_characteristics_length(self._setup_df(), None)

def test_validate_sequence_characteristics_length_no_length_column(self):
    # Without a 'length' column the validator has to complain about it.
    without_length = self._setup_df().drop(columns=['length'])
    expected = ("Column 'length' has to exist in "
                "the file.")
    self._assert_validation_error(without_length, expected)

def test_validate_sequence_characteristics_length_not_numerical(self):
    # A string in the 'length' column has to be rejected.
    table = self._setup_df()
    table.loc[1, 'length'] = 'a'
    expected = ("Values in column 'length' have "
                "to be numerical.")
    self._assert_validation_error(table, expected)

def test_validate_sequence_characteristics_length_empty_values(self):
    # A missing value in the 'length' column has to be rejected.
    table = self._setup_df()
    table.loc[1, 'length'] = None
    expected = ("Column 'length' cannot contain "
                "empty (NaN) values.")
    self._assert_validation_error(table, expected)

def test_validate_sequence_characteristics_length_negative_values(self):
    # A negative value in the 'length' column has to be rejected.
    table = self._setup_df()
    table.loc[1, 'length'] = -1
    expected = ("Column 'length' cannot contain "
                "negative values.")
    self._assert_validation_error(table, expected)

def _setup_df(self):
    # Load the reference TSV with its first column as the index.
    reference_tsv = self.get_data_path("sequence_characteristics_length.txt")
    table = pd.read_csv(reference_tsv, sep="\t", index_col=0)
    return table

def _assert_validation_error(self, data, error_message):
    # Run the validator on ``data`` and require a ValidationError whose text
    # equals ``error_message``.
    with self.assertRaises(ValidationError) as caught:
        validate_sequence_characteristics_length(data, None)
    raised_text = str(caught.exception)
    self.assertEqual(raised_text, error_message)


if __name__ == "__main__":
unittest.main()
Loading