qiime2 · lizgehret · May 16, 2024 · May 8, 2024 · May 8, 2024 · May 14, 2024
diff --git a/q2_types/feature_data/__init__.py b/q2_types/feature_data/__init__.py
@@ -27,11 +27,13 @@
     MixedCaseAlignedDNAFASTAFormat,
     MixedCaseAlignedDNASequencesDirectoryFormat,
     MixedCaseAlignedRNAFASTAFormat,
-    MixedCaseAlignedRNASequencesDirectoryFormat)
+    MixedCaseAlignedRNASequencesDirectoryFormat,
+    SequenceCharacteristicsDirectoryFormat,
+    SequenceCharacteristicsFormat)
 from ._type import (
     FeatureData, Taxonomy, Sequence, PairedEndSequence, AlignedSequence,
     Differential, ProteinSequence, AlignedProteinSequence, RNASequence,
-    AlignedRNASequence, PairedEndRNASequence, BLAST6)
+    AlignedRNASequence, PairedEndRNASequence, BLAST6, SequenceCharacteristics)
 
 # TODO remove these imports when tests are rewritten. Remove from __all__ too
 from ._transformer import (
@@ -67,6 +69,8 @@
     'MixedCaseAlignedProteinFASTAFormat',
     'MixedCaseProteinSequencesDirectoryFormat',
     'MixedCaseAlignedProteinSequencesDirectoryFormat',
+    'SequenceCharacteristics', 'SequenceCharacteristicsDirectoryFormat',
+    'SequenceCharacteristicsFormat'
     ]
 
 importlib.import_module('q2_types.feature_data._transformer')
diff --git a/q2_types/feature_data/_format.py b/q2_types/feature_data/_format.py
@@ -470,6 +470,36 @@ def validate(self, *args):
     'BLAST6DirectoryFormat', 'blast6.tsv', BLAST6Format)
 
 
+class SequenceCharacteristicsFormat(model.TextFileFormat):
+    """
+    Format for a TSV file with information about sequences like length of a
+    feature. The first column contains feature identifiers and is followed by
+    one or more additional columns.
+
+    The file cannot be empty and must have at least two columns.
+
+    Validation for additional columns can be added with a semantic validator
+    tied to a property. For example the
+    "validate_sequence_characteristics_length" validator for
+    "FeatureData[SequenceCharacteristics % Properties("length")]"
+    adds validation for a numerical column called "length".
+    """
+
+    def validate(self, n_records=None):
+        """
+        Check that the file parses as a TSV with at least one data column.
+
+        ``n_records`` is accepted but not used; the whole file is parsed.
+
+        Raises ``ValidationError`` if the file is empty, cannot be parsed, or
+        holds nothing but the identifier column.
+        """
+        try:
+            data = pd.read_csv(str(self), sep="\t", index_col=0)
+        except pd.errors.EmptyDataError:
+            raise ValidationError('File cannot be empty.')
+        except pd.errors.ParserError as err:
+            # Report malformed TSV as a validation failure instead of
+            # letting the raw pandas error escape.
+            raise ValidationError(
+                'File could not be parsed as TSV: %s' % err) from err
+
+        # The first column is consumed as the index, so "at least two
+        # columns" means at least one column must remain. Count the columns
+        # rather than testing the truthiness of their labels.
+        if len(data.columns) == 0:
+            raise ValidationError('File needs to have at least two columns.')
+
+
+# Directory format wrapping a single SequenceCharacteristicsFormat file,
+# stored under the name "sequence_characteristics.txt".
+SequenceCharacteristicsDirectoryFormat = model.SingleFileDirectoryFormat(
+    "SequenceCharacteristicsDirectoryFormat",
+    "sequence_characteristics.txt", SequenceCharacteristicsFormat
+)
+
 plugin.register_formats(
     TSVTaxonomyFormat, TSVTaxonomyDirectoryFormat,
     HeaderlessTSVTaxonomyFormat, HeaderlessTSVTaxonomyDirectoryFormat,
@@ -489,5 +519,6 @@ def validate(self, *args):
     MixedCaseRNASequencesDirectoryFormat, MixedCaseAlignedDNAFASTAFormat,
     MixedCaseAlignedDNASequencesDirectoryFormat,
     MixedCaseAlignedRNAFASTAFormat,
-    MixedCaseAlignedRNASequencesDirectoryFormat
+    MixedCaseAlignedRNASequencesDirectoryFormat, SequenceCharacteristicsFormat,
+    SequenceCharacteristicsDirectoryFormat
 )
diff --git a/q2_types/feature_data/_transformer.py b/q2_types/feature_data/_transformer.py
@@ -22,7 +22,8 @@
                AlignedProteinFASTAFormat, RNAFASTAFormat,
                AlignedRNAFASTAFormat, PairedRNASequencesDirectoryFormat,
                BLAST6Format, MixedCaseDNAFASTAFormat, MixedCaseRNAFASTAFormat,
-               MixedCaseAlignedDNAFASTAFormat, MixedCaseAlignedRNAFASTAFormat)
+               MixedCaseAlignedDNAFASTAFormat, MixedCaseAlignedRNAFASTAFormat,
+               SequenceCharacteristicsFormat)
 
 
 # Taxonomy format transformers
@@ -797,3 +798,15 @@ def _227(ff: BLAST6Format) -> qiime2.Metadata:
     # default int index but cast to a str and give it a name.
     data.index = pd.Index(data.index.astype(str), name='id')
     return qiime2.Metadata(data)
+
+
+@plugin.register_transformer
+def _228(ff: SequenceCharacteristicsFormat) -> pd.DataFrame:
+    # The first TSV column becomes the index of the returned frame.
+    tsv_path = str(ff)
+    table = pd.read_csv(tsv_path, sep="\t", index_col=0)
+    return table
+
+
+@plugin.register_transformer
+def _229(data: pd.DataFrame) -> SequenceCharacteristicsFormat:
+    # Write the frame (index included) as TSV into a new format instance.
+    out_ff = SequenceCharacteristicsFormat()
+    data.to_csv(path_or_buf=str(out_ff), sep='\t')
+    return out_ff
diff --git a/q2_types/feature_data/_type.py b/q2_types/feature_data/_type.py
@@ -5,7 +5,9 @@
 #
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
-
+import pandas as pd
+from qiime2.core.exceptions import ValidationError
+from qiime2.core.type import Properties
 from qiime2.plugin import SemanticType
 
 from ..plugin_setup import plugin
@@ -15,7 +17,8 @@
                DifferentialDirectoryFormat, ProteinSequencesDirectoryFormat,
                AlignedProteinSequencesDirectoryFormat,
                RNASequencesDirectoryFormat, AlignedRNASequencesDirectoryFormat,
-               PairedRNASequencesDirectoryFormat, BLAST6DirectoryFormat)
+               PairedRNASequencesDirectoryFormat, BLAST6DirectoryFormat,
+               SequenceCharacteristicsDirectoryFormat)
 from q2_types.sample_data import SampleData
 
 
@@ -52,13 +55,40 @@
                       variant_of=[FeatureData.field['type'],
                                   SampleData.field['type']])
 
+# Semantic type for sequence characteristics; a variant of FeatureData.
+SequenceCharacteristics = SemanticType('SequenceCharacteristics',
+                                       variant_of=FeatureData.field['type'])
+
+
+@plugin.register_validator(FeatureData[SequenceCharacteristics %
+                                       Properties("length")])
+def validate_sequence_characteristics_length(data: pd.DataFrame, level):
+    """
+    Semantic validator for the FeatureData[SequenceCharacteristics] type
+    with property "length".
+
+    The data has to contain a numerical column called 'length' that holds
+    no empty values and only strictly positive numbers. ``level`` is not
+    used by this validator.
+
+    Raises ``ValidationError`` on the first check that fails.
+    """
+    if 'length' not in data.columns:
+        raise ValidationError("Column 'length' has to exist in the file.")
+
+    lengths = data['length']
+
+    if lengths.isnull().any():
+        raise ValidationError("Column 'length' cannot contain empty (NaN) "
+                              "values.")
+
+    if not pd.api.types.is_numeric_dtype(lengths):
+        raise ValidationError("Values in column 'length' have to be "
+                              "numerical.")
+
+    # Negative and zero lengths are both rejected, but they are reported
+    # separately so that the message matches the offending values.
+    if (lengths < 0).any():
+        raise ValidationError("Column 'length' cannot contain negative "
+                              "values.")
+
+    if (lengths == 0).any():
+        raise ValidationError("Column 'length' cannot contain zero values.")
+
+
 plugin.register_semantic_types(FeatureData, Taxonomy, Sequence,
                                PairedEndSequence, AlignedSequence,
                                Differential, ProteinSequence,
                                AlignedProteinSequence, RNASequence,
                                AlignedRNASequence, PairedEndRNASequence,
-                               BLAST6)
-
+                               BLAST6, SequenceCharacteristics)
 
 plugin.register_artifact_class(
     FeatureData[Taxonomy],
@@ -120,3 +150,8 @@
     directory_format=BLAST6DirectoryFormat,
     description=("BLAST results associated with a set of feature "
                  "identifiers."))
+# Bind FeatureData[SequenceCharacteristics] to its directory format.
+plugin.register_artifact_class(
+    FeatureData[SequenceCharacteristics],
+    directory_format=SequenceCharacteristicsDirectoryFormat,
+    description=("Characteristics of sequences (e.g., the length of a gene "
+                 "in basepairs)."))
diff --git a/q2_types/feature_data/tests/data/empty.txt b/q2_types/feature_data/tests/data/empty.txt
diff --git a/q2_types/feature_data/tests/data/sequence_characteristics_length.txt b/q2_types/feature_data/tests/data/sequence_characteristics_length.txt
@@ -0,0 +1,3 @@
+id	length
+1	876
+2	54
diff --git a/q2_types/feature_data/tests/data/sequence_characteristics_only_index.txt b/q2_types/feature_data/tests/data/sequence_characteristics_only_index.txt
@@ -0,0 +1,3 @@
+id
+1
+2
diff --git a/q2_types/feature_data/tests/test_format.py b/q2_types/feature_data/tests/test_format.py
@@ -29,7 +29,9 @@
     MixedCaseRNAFASTAFormat, MixedCaseRNASequencesDirectoryFormat,
     MixedCaseAlignedDNAFASTAFormat,
     MixedCaseAlignedDNASequencesDirectoryFormat,
-    MixedCaseAlignedRNAFASTAFormat, MixedCaseAlignedRNASequencesDirectoryFormat
+    MixedCaseAlignedRNAFASTAFormat,
+    MixedCaseAlignedRNASequencesDirectoryFormat,
+    SequenceCharacteristicsDirectoryFormat, SequenceCharacteristicsFormat
 )
 from qiime2.plugin.testing import TestPluginBase
 from qiime2.plugin import ValidationError
@@ -901,5 +903,37 @@ def test_blast6_format_invalid(self):
             BLAST6DirectoryFormat(temp_dir, mode='r').validate()
 
 
+class TestSequenceCharacteristicsFormat(TestPluginBase):
+    """Validation tests for the sequence characteristics file formats."""
+
+    package = 'q2_types.feature_data.tests'
+
+    def _assert_invalid(self, filename, message):
+        # Validate the named fixture and require the exact error message.
+        ff = SequenceCharacteristicsFormat(self.get_data_path(filename),
+                                           mode='r')
+        with self.assertRaises(ValidationError) as raised:
+            ff.validate()
+        self.assertEqual(str(raised.exception), message)
+
+    def test_sequence_characteristics_directory_format(self):
+        # Stage the fixture under the directory format's file name.
+        source = self.get_data_path('sequence_characteristics_length.txt')
+        target = os.path.join(self.temp_dir.name,
+                              'sequence_characteristics.txt')
+        shutil.copy(source, target)
+        dir_fmt = SequenceCharacteristicsDirectoryFormat(self.temp_dir.name,
+                                                         mode='r')
+        dir_fmt.validate()
+
+    def test_sequence_characteristics_format(self):
+        source = self.get_data_path('sequence_characteristics_length.txt')
+        SequenceCharacteristicsFormat(source, mode='r').validate()
+
+    def test_sequence_characteristics_format_empty(self):
+        self._assert_invalid('empty.txt', 'File cannot be empty.')
+
+    def test_sequence_characteristics_format_only_index(self):
+        self._assert_invalid('sequence_characteristics_only_index.txt',
+                             'File needs to have at least two columns.')
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/q2_types/feature_data/tests/test_transformer.py b/q2_types/feature_data/tests/test_transformer.py
@@ -5,7 +5,7 @@
 #
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
-
+import filecmp
 import os.path
 import unittest
 
@@ -25,7 +25,7 @@
     AlignedProteinFASTAFormat, RNAFASTAFormat, AlignedRNAFASTAFormat,
     RNAIterator, AlignedRNAIterator, BLAST6Format, MixedCaseDNAFASTAFormat,
     MixedCaseRNAFASTAFormat, MixedCaseAlignedDNAFASTAFormat,
-    MixedCaseAlignedRNAFASTAFormat
+    MixedCaseAlignedRNAFASTAFormat, SequenceCharacteristicsFormat
 )
 from q2_types.feature_data._transformer import (
     _taxonomy_formats_to_dataframe, _dataframe_to_tsv_taxonomy_format,
@@ -1507,5 +1507,32 @@ def test_blast6_to_metadata(self):
         assert_frame_equal(obs.to_dataframe(), exp)
 
 
+class TestSequenceCharacteristicsTransformer(TestPluginBase):
+    """Round-trip tests between DataFrame and SequenceCharacteristicsFormat."""
+
+    package = 'q2_types.feature_data.tests'
+
+    def setUp(self):
+        super().setUp()
+        # Fixture file and the frame it is expected to parse into.
+        self.exp_file = self.get_data_path(
+            "sequence_characteristics_length.txt")
+        self.exp_df = pd.DataFrame({'length': [876, 54]},
+                                   index=pd.Index([1, 2], name='id'))
+
+    def test_df_to_sequence_characteristics_format(self):
+        transformer = self.get_transformer(pd.DataFrame,
+                                           SequenceCharacteristicsFormat)
+        obs = transformer(self.exp_df)
+
+        self.assertIsInstance(obs, SequenceCharacteristicsFormat)
+        # Use a unittest assertion (bare asserts vanish under -O) and force
+        # a content comparison instead of relying on stat signatures.
+        self.assertTrue(
+            filecmp.cmp(self.exp_file, obs.path, shallow=False))
+
+    def test_sequence_characteristics_format_to_df(self):
+        transformer = self.get_transformer(SequenceCharacteristicsFormat,
+                                           pd.DataFrame)
+        ff = SequenceCharacteristicsFormat(self.exp_file, mode="r")
+        obs = transformer(ff)
+
+        assert_frame_equal(self.exp_df, obs)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/q2_types/feature_data/tests/test_type.py b/q2_types/feature_data/tests/test_type.py
@@ -8,6 +8,9 @@
 
 import unittest
 
+import pandas as pd
+from qiime2.core.exceptions import ValidationError
+
 from q2_types.feature_data import (
     FeatureData, Taxonomy, Sequence, PairedEndSequence, AlignedSequence,
     Differential, TSVTaxonomyDirectoryFormat, DNASequencesDirectoryFormat,
@@ -17,10 +20,14 @@
     AlignedProteinSequence, RNASequence, RNASequencesDirectoryFormat,
     AlignedRNASequencesDirectoryFormat, AlignedRNASequence,
     PairedRNASequencesDirectoryFormat, PairedEndRNASequence,
-    BLAST6, BLAST6DirectoryFormat
+    BLAST6, BLAST6DirectoryFormat, SequenceCharacteristics,
+    SequenceCharacteristicsDirectoryFormat
 )
 from qiime2.plugin.testing import TestPluginBase
 
+from q2_types.feature_data._type import \
+    validate_sequence_characteristics_length
+
 
 class TestTypes(TestPluginBase):
     package = 'q2_types.feature_data.tests'
@@ -118,6 +125,51 @@ def test_blast6_semantic_type_to_format_registration(self):
         self.assertSemanticTypeRegisteredToFormat(
                 FeatureData[BLAST6], BLAST6DirectoryFormat)
 
+    def test_sequence_characteristics_semantic_type_registration(self):
+        # SequenceCharacteristics must be registered as a semantic type.
+        self.assertRegisteredSemanticType(SequenceCharacteristics)
+
+    def test_sequence_characteristics_semantic_type_format_registration(self):
+        # FeatureData[SequenceCharacteristics] must be registered to the
+        # SequenceCharacteristicsDirectoryFormat.
+        self.assertSemanticTypeRegisteredToFormat(
+            FeatureData[SequenceCharacteristics],
+            SequenceCharacteristicsDirectoryFormat)
+
+    def test_validate_sequence_characteristics_length(self):
+        # The unmodified fixture is valid: the validator must not raise.
+        validate_sequence_characteristics_length(self._setup_df(), None)
+
+    def test_validate_sequence_characteristics_length_no_length_column(self):
+        # Remove the 'length' column so only the index is left.
+        without_length = self._setup_df().drop(columns=['length'])
+        expected = "Column 'length' has to exist in the file."
+        self._assert_validation_error(without_length, expected)
+
+    def test_validate_sequence_characteristics_length_not_numerical(self):
+        data = self._setup_df()
+        # Cast to object first: writing a string into an integer column
+        # relies on implicit upcasting, which pandas has deprecated.
+        data['length'] = data['length'].astype(object)
+        data.loc[1, 'length'] = 'a'
+        self._assert_validation_error(data, "Values in column 'length' have "
+                                            "to be numerical.")
+
+    def test_validate_sequence_characteristics_length_empty_values(self):
+        data = self._setup_df()
+        # Cast to float first: an integer column cannot hold a missing
+        # value, and implicit upcasting on assignment is deprecated in
+        # pandas.
+        data['length'] = data['length'].astype(float)
+        data.loc[1, 'length'] = None
+        self._assert_validation_error(data, "Column 'length' cannot contain "
+                                            "empty (NaN) values.")
+
+    def test_validate_sequence_characteristics_length_negative_values(self):
+        # Overwrite the length of the feature with id 1 by a negative number.
+        frame = self._setup_df()
+        frame.loc[1, 'length'] = -1
+        expected = "Column 'length' cannot contain negative values."
+        self._assert_validation_error(frame, expected)
+
+    def _setup_df(self):
+        # Load the valid fixture; its first column becomes the index.
+        fixture = self.get_data_path("sequence_characteristics_length.txt")
+        frame = pd.read_csv(fixture, sep="\t", index_col=0)
+        return frame
+
+    def _assert_validation_error(self, data, error_message):
+        # The validator must raise ValidationError with exactly this text.
+        with self.assertRaises(ValidationError) as raised:
+            validate_sequence_characteristics_length(data, None)
+        observed = str(raised.exception)
+        self.assertEqual(observed, error_message)
+
 
 if __name__ == "__main__":
     unittest.main()