Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move tabular json types from stats #351

Open
wants to merge 4 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions conda-recipe/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ requirements:
- qiime2 {{ qiime2_epoch }}.*
- samtools
- pyhmmer
- frictionless<=5.5.0
build:
- setuptools
- versioningit
Expand Down
1 change: 1 addition & 0 deletions q2_types/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,3 +153,4 @@
importlib.import_module('q2_types.reference_db._deferred_setup')
importlib.import_module('q2_types.sample_data._deferred_setup')
importlib.import_module('q2_types.tree._deferred_setup')
importlib.import_module('q2_types.tabular._deferred_setup')
20 changes: 20 additions & 0 deletions q2_types/tabular/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2024, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

from .formats import (TableJSONLFileFormat, TableJSONLDirFmt,
NDJSONFileFormat, DataResourceSchemaFileFormat,
TabularDataResourceDirFmt)
from .types import (StatsTable, Pairwise, Dist1D, Ordered, Unordered,
NestedOrdered, NestedUnordered, Multi,
Matched, Independent)

# Public API of the q2_types.tabular package: file/directory formats for
# tabular JSON data plus the semantic types they can represent.
__all__ = ['TableJSONLFileFormat', 'TableJSONLDirFmt',
           'NDJSONFileFormat', 'DataResourceSchemaFileFormat',
           'TabularDataResourceDirFmt', 'StatsTable', 'Pairwise',
           'Dist1D', 'Ordered', 'Unordered', 'NestedOrdered',
           'NestedUnordered', 'Multi', 'Matched', 'Independent']
45 changes: 45 additions & 0 deletions q2_types/tabular/_deferred_setup/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2024, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

import importlib

from .. import (NDJSONFileFormat,
DataResourceSchemaFileFormat,
TabularDataResourceDirFmt,
TableJSONLFileFormat, TableJSONLDirFmt,
StatsTable, Pairwise, Dist1D,
Matched, Independent, Ordered, Unordered, Multi,
NestedOrdered, NestedUnordered)

from ...plugin_setup import plugin

# Register the on-disk file and directory formats with the plugin so the
# framework can recognize and validate them.
plugin.register_formats(NDJSONFileFormat, DataResourceSchemaFileFormat,
                        TabularDataResourceDirFmt)
plugin.register_formats(TableJSONLFileFormat, TableJSONLDirFmt)


# Register the semantic types for tabular stats/distribution data.
plugin.register_semantic_types(StatsTable, Pairwise, Dist1D,
                               NestedOrdered, NestedUnordered, Matched,
                               Independent, Ordered, Unordered, Multi)

# Map every supported semantic-type expression onto TableJSONLDirFmt as its
# on-disk representation.
plugin.register_semantic_type_to_format(
    Dist1D[Ordered | Unordered | NestedOrdered | NestedUnordered | Multi,
           Matched | Independent] |
    StatsTable[Pairwise],
    TableJSONLDirFmt)

# Transformers and validators attach themselves to `plugin` as a side effect
# of being imported, so these imports must run after the registrations above.
importlib.import_module('._transformers', __name__)
importlib.import_module('._validators', __name__)


__all__ = [
    'StatsTable', 'Pairwise', 'Dist1D', 'NestedOrdered', 'NestedUnordered',
    'Matched', 'Independent', 'Ordered', 'Unordered', 'Multi',
    'NDJSONFileFormat', 'DataResourceSchemaFileFormat',
    'TabularDataResourceDirFmt', 'TableJSONLFileFormat', 'TableJSONLDirFmt',
]
172 changes: 172 additions & 0 deletions q2_types/tabular/_deferred_setup/_transformers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2024, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

import pandas as pd
import frictionless as fls
import json

from ..formats import TableJSONLFileFormat

from .. import (NDJSONFileFormat,
DataResourceSchemaFileFormat,
TabularDataResourceDirFmt)

from ...plugin_setup import plugin


def table_jsonl_header(df: pd.DataFrame) -> str:
    """Build the single-line JSON header for a table.jsonl file.

    The header records the doctype, per-column field metadata taken from
    each Series' ``attrs`` (title, description, type, missing, extra),
    and table-level metadata taken from ``df.attrs``.

    Parameters
    ----------
    df : pd.DataFrame
        The table whose header is being generated.

    Returns
    -------
    str
        A compact (whitespace-free) JSON object on a single line.
    """
    header = {}
    header['doctype'] = dict(
        name='table.jsonl', format='application/x-json-lines', version='1.0')
    header['direction'] = 'row'
    header['style'] = 'key:value'

    fields = []
    for name in df.columns:
        attrs = df[name].attrs.copy()
        title = attrs.pop('title', '')
        description = attrs.pop('description', '')
        # named field_type to avoid shadowing the builtin `type`
        field_type = attrs.pop('type', None)
        missing = attrs.pop('missing', False)
        extra = attrs.pop('extra', None)
        if extra is None:
            # Any remaining (unrecognized) attrs are preserved as "extra".
            extra = attrs
        fields.append(dict(
            name=name, type=field_type, missing=missing, title=title,
            description=description, extra=extra))

    header['fields'] = fields
    header['index'] = []
    header['title'] = df.attrs.get('title', '')
    header['description'] = df.attrs.get('description', '')
    header['extra'] = df.attrs.get('extra', {})

    # prevent whitespace after comma and colon so the file's leading bytes
    # are predictable (format validation checks this exact prefix)
    return json.dumps(header, separators=(',', ':'))


@plugin.register_transformer
def table_jsonl_to_df(ff: TableJSONLFileFormat) -> pd.DataFrame:
    """Read a table.jsonl file into a DataFrame.

    The first line of the file is a JSON header (see
    ``table_jsonl_header``); every subsequent line is one JSON record
    (one row).
    """
    with ff.open() as fh:
        # First line is the header; the rest of the stream is the records.
        header = json.loads(next(fh))
        df = pd.read_json(fh, lines=True, orient='records')
    if df.empty:
        # No record lines: still produce the columns declared in the header.
        df = pd.DataFrame(columns=[
            spec['name'] for spec in header['fields']])

    # The order of these steps matters.

    # 1. set order of columns to match the header's field order
    df = df[[spec['name'] for spec in header['fields']]]

    # 2. update types (read_json alone would leave these as generic
    #    object/inferred dtypes)
    for spec in header['fields']:
        col = spec['name']
        if spec['type'] == 'integer':
            df[col] = df[col].astype('int64')
        elif spec['type'] == 'number':
            df[col] = df[col].astype('float64')
        elif spec['type'] == 'datetime':
            df[col] = pd.to_datetime(df[col], format='iso8601')
        elif spec['type'] == 'date':
            # NOTE(review): dates are kept as full datetimes (no .dt.date),
            # unlike 'time' below — confirm this asymmetry is intended
            df[col] = pd.to_datetime(df[col], format='iso8601')
        elif spec['type'] == 'time':
            df[col] = pd.to_datetime(df[col], format='mixed').dt.time
        elif spec['type'] == 'duration':
            df[col] = pd.to_timedelta(df[col])

    # 3. set index (drop=False keeps the index columns in the frame)
    if len(header['index']) > 0:
        df = df.set_index(header['index'], drop=False)

    # 4. add metadata to columns — must happen after steps 1-3 because
    #    pandas does not propagate Series.attrs through those operations
    for spec in header['fields']:
        df[spec['name']].attrs.update(spec)

    # 5. add metadata to table
    attrs = dict(title=header['title'], description=header['description'])
    df.attrs.update(attrs)

    return df


@plugin.register_transformer
def df_to_table_jsonl(obj: pd.DataFrame) -> TableJSONLFileFormat:
    """Write a DataFrame as a table.jsonl file: a one-line JSON header
    followed by one JSON record per row."""
    result = TableJSONLFileFormat()
    with result.open() as out:
        out.write(table_jsonl_header(obj))
        out.write('\n')
        if obj.empty:
            return result
        # Records follow the header, one JSON object per line.
        obj.to_json(out, orient='records', lines=True, date_format='iso')
    return result


@plugin.register_transformer
def _1(obj: pd.DataFrame) -> NDJSONFileFormat:
    """Serialize a DataFrame to a newline-delimited JSON file."""
    fmt = NDJSONFileFormat()
    obj.to_json(str(fmt), orient='records', lines=True)
    return fmt


@plugin.register_transformer
def _2(obj: DataResourceSchemaFileFormat) -> fls.Resource:
    """Load the data-resource descriptor file as a frictionless Resource."""
    descriptor_path = str(obj)
    return fls.Resource(descriptor_path)


@plugin.register_transformer
def _3(df: TabularDataResourceDirFmt) -> pd.DataFrame:
    """Read a tabular data-resource directory into a DataFrame, attaching
    each field's schema entry to the matching column's ``attrs``."""
    ndjson = df.data.view(NDJSONFileFormat)
    frame = pd.read_json(str(ndjson), lines=True)
    resource = df.metadata.view(fls.Resource)

    schema_fields = resource.schema.fields
    if frame.empty:
        # An empty data file still yields the schema's declared columns.
        frame = pd.DataFrame(columns=[f.name for f in schema_fields])

    for field in schema_fields:
        frame[field.name].attrs = field.to_dict()

    return frame


@plugin.register_transformer
def _4(obj: pd.DataFrame) -> TabularDataResourceDirFmt:
    """Write a DataFrame as a tabular data-resource directory: the data as
    ndjson plus a frictionless dataresource.json descriptor."""
    fields = []
    for column in obj.columns:
        column_meta = obj[column].attrs.copy()
        inferred = obj[column].convert_dtypes().dtype

        if pd.api.types.is_float_dtype(inferred):
            field_type = 'number'
        elif pd.api.types.is_integer_dtype(inferred):
            field_type = 'integer'
        else:
            field_type = 'string'

        column_meta['name'] = column
        column_meta['type'] = field_type
        fields.append(column_meta)

    # Table-level attrs ride along at the top level of the descriptor.
    resource_meta = {'schema': {'fields': fields}, **obj.attrs}
    resource_meta['format'] = 'ndjson'
    resource_meta['path'] = 'data.ndjson'
    resource_meta['name'] = 'data'

    dir_fmt = TabularDataResourceDirFmt()
    dir_fmt.data.write_data(obj, pd.DataFrame)
    with open(dir_fmt.path / 'dataresource.json', 'w') as fh:
        json.dump(resource_meta, fh, indent=4)

    return dir_fmt
46 changes: 46 additions & 0 deletions q2_types/tabular/_deferred_setup/_validators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2024, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

import pandas as pd

from qiime2.plugin import ValidationError
from .. import (Dist1D, Ordered, Unordered, NestedOrdered,
NestedUnordered, Matched, Independent)
from ...plugin_setup import plugin


@plugin.register_validator(Dist1D[Ordered | Unordered,
                                  Matched | Independent])
def validate_all_dist_columns_present(data: pd.DataFrame, level):
    """Ensure every column required by a 1-D distribution exists."""
    for required in ('id', 'measure', 'group'):
        if required not in data.columns:
            raise ValidationError(f'"{required}" not found in distribution.')


@plugin.register_validator(Dist1D[Ordered | Unordered, Matched])
def validate_unique_subjects_within_group(data: pd.DataFrame, level):
    """Ensure no subject appears more than once within any single group."""
    if 'subject' not in data.columns:
        raise ValidationError('"subject" not found in distribution.')

    for group_id, members in data.groupby('group'):
        repeated = members['subject'].duplicated()
        if repeated.any():
            dupes = list(members['subject'][repeated])
            raise ValidationError(
                'Unique subject found more than once within an individual'
                ' group. Group(s) where duplicated subject was found:'
                f' [{group_id}] Duplicated subjects: {dupes}')


@plugin.register_validator(Dist1D[NestedOrdered | NestedUnordered,
                                  Matched | Independent])
def validate_all_nesteddist_columns_present(data: pd.DataFrame, level):
    """Ensure every column required by a nested distribution exists."""
    for required in ('id', 'measure', 'group', 'class', 'level'):
        if required not in data.columns:
            raise ValidationError(f'"{required}" not found in distribution.')
49 changes: 49 additions & 0 deletions q2_types/tabular/formats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2024, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

from qiime2.plugin import ValidationError, model

from frictionless import validate


class TableJSONLFileFormat(model.TextFileFormat):
    """A table.jsonl file: one JSON header line followed by JSON records.

    Validation only checks that the file begins with the compact doctype
    header prefix; record lines are not inspected.
    """

    # Exact leading bytes produced by the header writer (compact JSON,
    # no whitespace after ',' or ':').
    _HEADER_PREFIX = '{"doctype":{"name":"table.jsonl",'

    def _validate_(self, level):
        with self.open() as fh:
            # Raise ValidationError rather than using `assert`: asserts are
            # stripped under `python -O` and would surface as AssertionError
            # instead of a proper format-validation failure.
            if fh.read(len(self._HEADER_PREFIX)) != self._HEADER_PREFIX:
                raise ValidationError(
                    'File does not appear to be a table.jsonl file: '
                    f'expected it to start with {self._HEADER_PREFIX!r}')


TableJSONLDirFmt = model.SingleFileDirectoryFormat(
    'TableJSONLDirFmt', 'data.table.jsonl', TableJSONLFileFormat)


class NDJSONFileFormat(model.TextFileFormat):
    """Format for newline-delimited (ND) JSON file."""
    def _validate_(self, level):
        # Deliberately a no-op: any text file is accepted; per-record
        # validation is left to consumers of the format.
        pass


class DataResourceSchemaFileFormat(model.TextFileFormat):
    """
    Format for data resource schema.
    """
    def _validate_(self, level):
        # Deliberately a no-op: the descriptor is checked against the data
        # at the directory-format level, not per-file.
        pass


class TabularDataResourceDirFmt(model.DirectoryFormat):
    """Directory format holding an ndjson data file plus the frictionless
    data-resource descriptor that describes it."""

    data = model.File('data.ndjson', format=NDJSONFileFormat)
    metadata = model.File('dataresource.json',
                          format=DataResourceSchemaFileFormat)

    def _validate_(self, level='min'):
        # frictionless.validate returns a Report — it does not raise — so
        # the previous `except ValidationError` branch could never fire and
        # invalid descriptors always passed. Inspect the report instead.
        report = validate(str(self.path / 'dataresource.json'))
        if not report.valid:
            raise model.ValidationError(
                'The dataresource does not completely describe'
                ' the data.ndjson file')
7 changes: 7 additions & 0 deletions q2_types/tabular/tests/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2024, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
1 change: 1 addition & 0 deletions q2_types/tabular/tests/data/empty_data_dist.table.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"doctype":{"name":"table.jsonl","format":"application/x-json-lines","version":"1.0"},"direction":"row","style":"key:value","fields":[{"name":"id","type":"string","missing":false,"title":"id","description":"...","extra":{"name":"id"}},{"name":"measure","type":"number","missing":false,"title":"faith_pd","description":"...","extra":{"name":"measure"}},{"name":"group","type":"integer","missing":false,"title":"week","description":"...","extra":{"name":"group"}},{"name":"subject","type":"string","missing":false,"title":"SubjectID","description":"...","extra":{"name":"subject"}}],"index":[],"title":"","description":"","extra":{}}
Empty file.
33 changes: 33 additions & 0 deletions q2_types/tabular/tests/data/empty_data_dist/dataresource.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
{
"schema": {
"fields": [
{
"title": "id",
"description": "...",
"name": "id",
"type": "string"
},
{
"title": "faith_pd",
"description": "...",
"name": "measure",
"type": "number"
},
{
"title": "week",
"description": "...",
"name": "group",
"type": "integer"
},
{
"title": "SubjectID",
"description": "...",
"name": "subject",
"type": "string"
}
]
},
"format": "ndjson",
"path": "data.ndjson",
"name": "data"
}
Loading
Loading