Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move tabular json types from stats #351

Open
wants to merge 4 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions conda-recipe/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ requirements:
- qiime2 {{ qiime2_epoch }}.*
- samtools
- pyhmmer
- frictionless<=5.5.0
build:
- setuptools
- versioningit
Expand Down
1 change: 1 addition & 0 deletions q2_types/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,3 +153,4 @@
importlib.import_module('q2_types.reference_db._deferred_setup')
importlib.import_module('q2_types.sample_data._deferred_setup')
importlib.import_module('q2_types.tree._deferred_setup')
importlib.import_module('q2_types.tabular._deferred_setup')
20 changes: 20 additions & 0 deletions q2_types/tabular/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2024, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

from .formats import (TableJSONLFileFormat, TableJSONLDirFmt,
NDJSONFileFormat, DataResourceSchemaFileFormat,
TabularDataResourceDirFmt)
from .types import (StatsTable, Pairwise, Dist1D, Ordered, Unordered,
NestedOrdered, NestedUnordered, Multi,
Matched, Independent)

# Public API of the q2_types.tabular package: file/directory formats for
# tabular JSON data plus the semantic types they can represent.
__all__ = ['TableJSONLFileFormat', 'TableJSONLDirFmt',
           'NDJSONFileFormat', 'DataResourceSchemaFileFormat',
           'TabularDataResourceDirFmt', 'StatsTable', 'Pairwise',
           'Dist1D', 'Ordered', 'Unordered', 'NestedOrdered',
           'NestedUnordered', 'Multi', 'Matched', 'Independent']
45 changes: 45 additions & 0 deletions q2_types/tabular/_deferred_setup/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2024, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

import importlib

from .. import (NDJSONFileFormat,
DataResourceSchemaFileFormat,
TabularDataResourceDirFmt,
TableJSONLFileFormat, TableJSONLDirFmt,
StatsTable, Pairwise, Dist1D,
Matched, Independent, Ordered, Unordered, Multi,
NestedOrdered, NestedUnordered)

from ...plugin_setup import plugin

# Register the on-disk file and directory formats with the plugin so the
# framework can recognize and validate them.
plugin.register_formats(NDJSONFileFormat, DataResourceSchemaFileFormat,
                        TabularDataResourceDirFmt)
plugin.register_formats(TableJSONLFileFormat, TableJSONLDirFmt)


# Register the semantic types for tabular stats/distribution data.
plugin.register_semantic_types(StatsTable, Pairwise, Dist1D,
                               NestedOrdered, NestedUnordered, Matched,
                               Independent, Ordered, Unordered, Multi)

# Map every supported semantic-type expression onto TableJSONLDirFmt as its
# on-disk representation.
plugin.register_semantic_type_to_format(
    Dist1D[Ordered | Unordered | NestedOrdered | NestedUnordered | Multi,
           Matched | Independent] |
    StatsTable[Pairwise],
    TableJSONLDirFmt)

# Transformers and validators attach themselves to `plugin` as a side effect
# of being imported, so these imports must run after the registrations above.
importlib.import_module('._transformers', __name__)
importlib.import_module('._validators', __name__)


__all__ = [
    'StatsTable', 'Pairwise', 'Dist1D', 'NestedOrdered', 'NestedUnordered',
    'Matched', 'Independent', 'Ordered', 'Unordered', 'Multi',
    'NDJSONFileFormat', 'DataResourceSchemaFileFormat',
    'TabularDataResourceDirFmt', 'TableJSONLFileFormat', 'TableJSONLDirFmt',
]
172 changes: 172 additions & 0 deletions q2_types/tabular/_deferred_setup/_transformers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2024, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

import pandas as pd
import frictionless as fls
import json

from ..formats import TableJSONLFileFormat

from .. import (NDJSONFileFormat,
DataResourceSchemaFileFormat,
TabularDataResourceDirFmt)

from ...plugin_setup import plugin


def table_jsonl_header(df: pd.DataFrame) -> str:
    """Build the single-line JSON header for a table.jsonl file.

    The header records the doctype, per-column field metadata taken from
    each Series' ``attrs`` (title, description, type, missing, extra),
    and table-level metadata taken from ``df.attrs``.

    Parameters
    ----------
    df : pd.DataFrame
        The table whose header is being generated.

    Returns
    -------
    str
        A compact (whitespace-free) JSON object on a single line.
    """
    header = {}
    header['doctype'] = dict(
        name='table.jsonl', format='application/x-json-lines', version='1.0')
    header['direction'] = 'row'
    header['style'] = 'key:value'

    fields = []
    for name in df.columns:
        attrs = df[name].attrs.copy()
        title = attrs.pop('title', '')
        description = attrs.pop('description', '')
        # named field_type to avoid shadowing the builtin `type`
        field_type = attrs.pop('type', None)
        missing = attrs.pop('missing', False)
        extra = attrs.pop('extra', None)
        if extra is None:
            # Any remaining (unrecognized) attrs are preserved as "extra".
            extra = attrs
        fields.append(dict(
            name=name, type=field_type, missing=missing, title=title,
            description=description, extra=extra))

    header['fields'] = fields
    header['index'] = []
    header['title'] = df.attrs.get('title', '')
    header['description'] = df.attrs.get('description', '')
    header['extra'] = df.attrs.get('extra', {})

    # prevent whitespace after comma and colon so the file's leading bytes
    # are predictable (format validation checks this exact prefix)
    return json.dumps(header, separators=(',', ':'))


@plugin.register_transformer
def table_jsonl_to_df(ff: TableJSONLFileFormat) -> pd.DataFrame:
    """Read a table.jsonl file into a DataFrame.

    The first line of the file is a JSON header (see
    ``table_jsonl_header``); every subsequent line is one JSON record
    (one row).
    """
    with ff.open() as fh:
        # First line is the header; the rest of the stream is the records.
        header = json.loads(next(fh))
        df = pd.read_json(fh, lines=True, orient='records')
    if df.empty:
        # No record lines: still produce the columns declared in the header.
        df = pd.DataFrame(columns=[
            spec['name'] for spec in header['fields']])

    # The order of these steps matters.

    # 1. set order of columns to match the header's field order
    df = df[[spec['name'] for spec in header['fields']]]

    # 2. update types (read_json alone would leave these as generic
    #    object/inferred dtypes)
    for spec in header['fields']:
        col = spec['name']
        if spec['type'] == 'integer':
            df[col] = df[col].astype('int64')
        elif spec['type'] == 'number':
            df[col] = df[col].astype('float64')
        elif spec['type'] == 'datetime':
            df[col] = pd.to_datetime(df[col], format='iso8601')
        elif spec['type'] == 'date':
            # NOTE(review): dates are kept as full datetimes (no .dt.date),
            # unlike 'time' below — confirm this asymmetry is intended
            df[col] = pd.to_datetime(df[col], format='iso8601')
        elif spec['type'] == 'time':
            df[col] = pd.to_datetime(df[col], format='mixed').dt.time
        elif spec['type'] == 'duration':
            df[col] = pd.to_timedelta(df[col])

    # 3. set index (drop=False keeps the index columns in the frame)
    if len(header['index']) > 0:
        df = df.set_index(header['index'], drop=False)

    # 4. add metadata to columns — must happen after steps 1-3 because
    #    pandas does not propagate Series.attrs through those operations
    for spec in header['fields']:
        df[spec['name']].attrs.update(spec)

    # 5. add metadata to table
    attrs = dict(title=header['title'], description=header['description'])
    df.attrs.update(attrs)

    return df


@plugin.register_transformer
def df_to_table_jsonl(obj: pd.DataFrame) -> TableJSONLFileFormat:
    """Write a DataFrame as a table.jsonl file: a one-line JSON header
    followed by one JSON record per row."""
    result = TableJSONLFileFormat()
    with result.open() as out:
        out.write(table_jsonl_header(obj))
        out.write('\n')
        if obj.empty:
            return result
        # Records follow the header, one JSON object per line.
        obj.to_json(out, orient='records', lines=True, date_format='iso')
    return result


@plugin.register_transformer
def _1(obj: pd.DataFrame) -> NDJSONFileFormat:
    """Serialize a DataFrame to a newline-delimited JSON file."""
    fmt = NDJSONFileFormat()
    obj.to_json(str(fmt), orient='records', lines=True)
    return fmt


@plugin.register_transformer
def _2(obj: DataResourceSchemaFileFormat) -> fls.Resource:
    """Load the data-resource descriptor file as a frictionless Resource."""
    descriptor_path = str(obj)
    return fls.Resource(descriptor_path)


@plugin.register_transformer
def _3(df: TabularDataResourceDirFmt) -> pd.DataFrame:
    """Read a tabular data-resource directory into a DataFrame, attaching
    each field's schema entry to the matching column's ``attrs``."""
    ndjson = df.data.view(NDJSONFileFormat)
    frame = pd.read_json(str(ndjson), lines=True)
    resource = df.metadata.view(fls.Resource)

    schema_fields = resource.schema.fields
    if frame.empty:
        # An empty data file still yields the schema's declared columns.
        frame = pd.DataFrame(columns=[f.name for f in schema_fields])

    for field in schema_fields:
        frame[field.name].attrs = field.to_dict()

    return frame


@plugin.register_transformer
def _4(obj: pd.DataFrame) -> TabularDataResourceDirFmt:
    """Write a DataFrame as a tabular data-resource directory: the data as
    ndjson plus a frictionless dataresource.json descriptor."""
    fields = []
    for column in obj.columns:
        column_meta = obj[column].attrs.copy()
        inferred = obj[column].convert_dtypes().dtype

        if pd.api.types.is_float_dtype(inferred):
            field_type = 'number'
        elif pd.api.types.is_integer_dtype(inferred):
            field_type = 'integer'
        else:
            field_type = 'string'

        column_meta['name'] = column
        column_meta['type'] = field_type
        fields.append(column_meta)

    # Table-level attrs ride along at the top level of the descriptor.
    resource_meta = {'schema': {'fields': fields}, **obj.attrs}
    resource_meta['format'] = 'ndjson'
    resource_meta['path'] = 'data.ndjson'
    resource_meta['name'] = 'data'

    dir_fmt = TabularDataResourceDirFmt()
    dir_fmt.data.write_data(obj, pd.DataFrame)
    with open(dir_fmt.path / 'dataresource.json', 'w') as fh:
        json.dump(resource_meta, fh, indent=4)

    return dir_fmt
46 changes: 46 additions & 0 deletions q2_types/tabular/_deferred_setup/_validators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2024, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

import pandas as pd

from qiime2.plugin import ValidationError
from .. import (Dist1D, Ordered, Unordered, NestedOrdered,
NestedUnordered, Matched, Independent)
from ...plugin_setup import plugin


@plugin.register_validator(Dist1D[Ordered | Unordered,
                                  Matched | Independent])
def validate_all_dist_columns_present(data: pd.DataFrame, level):
    """Ensure every column required by a 1-D distribution exists."""
    for required in ('id', 'measure', 'group'):
        if required not in data.columns:
            raise ValidationError(f'"{required}" not found in distribution.')


@plugin.register_validator(Dist1D[Ordered | Unordered, Matched])
def validate_unique_subjects_within_group(data: pd.DataFrame, level):
    """Ensure no subject appears more than once within any single group."""
    if 'subject' not in data.columns:
        raise ValidationError('"subject" not found in distribution.')

    for group_id, members in data.groupby('group'):
        repeated = members['subject'].duplicated()
        if repeated.any():
            dupes = list(members['subject'][repeated])
            raise ValidationError(
                'Unique subject found more than once within an individual'
                ' group. Group(s) where duplicated subject was found:'
                f' [{group_id}] Duplicated subjects: {dupes}')


@plugin.register_validator(Dist1D[NestedOrdered | NestedUnordered,
                                  Matched | Independent])
def validate_all_nesteddist_columns_present(data: pd.DataFrame, level):
    """Ensure every column required by a nested distribution exists."""
    for required in ('id', 'measure', 'group', 'class', 'level'):
        if required not in data.columns:
            raise ValidationError(f'"{required}" not found in distribution.')
49 changes: 49 additions & 0 deletions q2_types/tabular/formats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2024, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

from qiime2.plugin import ValidationError, model

from frictionless import validate


class TableJSONLFileFormat(model.TextFileFormat):
    """A table.jsonl file: one JSON header line followed by JSON records.

    Validation only checks that the file begins with the compact doctype
    header prefix; record lines are not inspected.
    """

    # Exact leading bytes produced by the header writer (compact JSON,
    # no whitespace after ',' or ':').
    _HEADER_PREFIX = '{"doctype":{"name":"table.jsonl",'

    def _validate_(self, level):
        with self.open() as fh:
            # Raise ValidationError rather than using `assert`: asserts are
            # stripped under `python -O` and would surface as AssertionError
            # instead of a proper format-validation failure.
            if fh.read(len(self._HEADER_PREFIX)) != self._HEADER_PREFIX:
                raise ValidationError(
                    'File does not appear to be a table.jsonl file: '
                    f'expected it to start with {self._HEADER_PREFIX!r}')


TableJSONLDirFmt = model.SingleFileDirectoryFormat(
    'TableJSONLDirFmt', 'data.table.jsonl', TableJSONLFileFormat)


class NDJSONFileFormat(model.TextFileFormat):
    """Format for newline-delimited (ND) JSON file."""
    def _validate_(self, level):
        # Deliberately a no-op: any text file is accepted; per-record
        # validation is left to consumers of the format.
        pass


class DataResourceSchemaFileFormat(model.TextFileFormat):
    """
    Format for data resource schema.
    """
    def _validate_(self, level):
        # Deliberately a no-op: the descriptor is checked against the data
        # at the directory-format level, not per-file.
        pass


class TabularDataResourceDirFmt(model.DirectoryFormat):
    """Directory format holding an ndjson data file plus the frictionless
    data-resource descriptor that describes it."""

    data = model.File('data.ndjson', format=NDJSONFileFormat)
    metadata = model.File('dataresource.json',
                          format=DataResourceSchemaFileFormat)

    def _validate_(self, level='min'):
        # frictionless.validate returns a Report — it does not raise — so
        # the previous `except ValidationError` branch could never fire and
        # invalid descriptors always passed. Inspect the report instead.
        report = validate(str(self.path / 'dataresource.json'))
        if not report.valid:
            raise model.ValidationError(
                'The dataresource does not completely describe'
                ' the data.ndjson file')
7 changes: 7 additions & 0 deletions q2_types/tabular/tests/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2024, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
1 change: 1 addition & 0 deletions q2_types/tabular/tests/data/empty_data_dist.table.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"doctype":{"name":"table.jsonl","format":"application/x-json-lines","version":"1.0"},"direction":"row","style":"key:value","fields":[{"name":"id","type":"string","missing":false,"title":"id","description":"...","extra":{"name":"id"}},{"name":"measure","type":"number","missing":false,"title":"faith_pd","description":"...","extra":{"name":"measure"}},{"name":"group","type":"integer","missing":false,"title":"week","description":"...","extra":{"name":"group"}},{"name":"subject","type":"string","missing":false,"title":"SubjectID","description":"...","extra":{"name":"subject"}}],"index":[],"title":"","description":"","extra":{}}
Empty file.
33 changes: 33 additions & 0 deletions q2_types/tabular/tests/data/empty_data_dist/dataresource.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
{
"schema": {
"fields": [
{
"title": "id",
"description": "...",
"name": "id",
"type": "string"
},
{
"title": "faith_pd",
"description": "...",
"name": "measure",
"type": "number"
},
{
"title": "week",
"description": "...",
"name": "group",
"type": "integer"
},
{
"title": "SubjectID",
"description": "...",
"name": "subject",
"type": "string"
}
]
},
"format": "ndjson",
"path": "data.ndjson",
"name": "data"
}
Loading
Loading