diff --git a/docs/api_reference/dataset/tabledoc.md b/docs/api_reference/dataset/tabledoc.md
new file mode 100644
index 00000000..f3a73929
--- /dev/null
+++ b/docs/api_reference/dataset/tabledoc.md
@@ -0,0 +1,3 @@
+# tabledoc
+
+::: tripper.dataset.tabledoc
diff --git a/pyproject.toml b/pyproject.toml
index d5f7f94a..0398f0a0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -104,7 +104,7 @@ max-public-methods = 25
 max-locals = 20
 disable = [
     "fixme",
-    "too-many-positional-arguments",
+    "invalid-name",
 ]
 good-names = [
     # Default
@@ -115,8 +115,8 @@ good-names = [
     "s", "p", "o",
     # Namespaces
     "EX",
-    # dict, value, file, ...
-    "d", "v", "f",
+    # dict, value, file, keyword...
+    "d", "v", "f", "kw",
 ]
 
 [tool.pytest.ini_options]
diff --git a/tests/dataset/dataset_paths.py b/tests/dataset/dataset_paths.py
new file mode 100644
index 00000000..e84b2f47
--- /dev/null
+++ b/tests/dataset/dataset_paths.py
@@ -0,0 +1,12 @@
+"""Defines paths for tests.
+
+It defines some directories that can be used by the tests, with or
+without conftest.
+"""
+
+from pathlib import Path
+
+testdir = Path(__file__).resolve().parent.parent
+ontodir = testdir / "ontologies"
+indir = testdir / "input"
+outdir = testdir / "output"
diff --git a/tests/dataset/test_dataaccess.py b/tests/dataset/test_dataaccess.py
index c3a7b75d..af058440 100644
--- a/tests/dataset/test_dataaccess.py
+++ b/tests/dataset/test_dataaccess.py
@@ -2,24 +2,19 @@
 
 # pylint: disable=invalid-name,too-many-locals,duplicate-code
 
-from pathlib import Path
-
 import pytest
 
 pytest.importorskip("yaml")
 pytest.importorskip("requests")
 
-thisdir = Path(__file__).resolve().parent
-testdir = thisdir.parent
-inputdir = testdir / "input"
-outputdir = testdir / "output"
-
 
 # if True:
 def test_save_and_load():
     """Test save() and load()."""
     # pylint: disable=too-many-statements
+    from dataset_paths import outdir  # pylint: disable=import-error
+
     from tripper import DCAT, DCTERMS, EMMO, Triplestore
     from tripper.dataset import load, load_dict, save, save_dict
 
@@ -38,7 +33,6 @@ def test_save_and_load():
     # Test save dict
     save_dict(
         ts,
-        type="dataset",
         dct={
             "@id": SEMDATA.img1,
             "distribution": {
@@ -49,6 +43,7 @@
                 "format": "tiff",
             },
         },
+        type="dataset",
     )
     newdistr = load_dict(ts, SEMDATA.img1)
     assert newdistr["@type"] == [DCAT.Dataset, EMMO.DataSet]
@@ -57,12 +52,12 @@
 
     save_dict(
         ts,
-        type="generator",
        dct={
             "@id": GEN.sem_hitachi,
             "generatorType": "application/vnd.dlite-generate",
             "configuration": {"driver": "hitachi"},
         },
+        type="generator",
     )
 
     # Test load dataset (this downloads an actual image from github)
@@ -70,7 +65,7 @@
     assert len(data) == 53502
 
     # Test save dataset with anonymous distribution
-    newfile = outputdir / "newimage.tiff"
+    newfile = outdir / "newimage.tiff"
     newfile.unlink(missing_ok=True)
     buf = b"some bytes..."
     save(
@@ -94,7 +89,7 @@ def test_save_and_load():
     assert newimage.distribution.downloadURL == f"file:{newfile}"
 
     # Test save dataset with named distribution
-    newfile2 = outputdir / "newimage.png"
+    newfile2 = outdir / "newimage.png"
     newfile2.unlink(missing_ok=True)
     save(
         ts,
diff --git a/tests/dataset/test_dataset.py b/tests/dataset/test_dataset.py
index 105db426..9bdec2c6 100644
--- a/tests/dataset/test_dataset.py
+++ b/tests/dataset/test_dataset.py
@@ -2,18 +2,11 @@
 
 # pylint: disable=invalid-name,too-many-locals,duplicate-code
 
-from pathlib import Path
-
 import pytest
 
 pytest.importorskip("yaml")
 pytest.importorskip("requests")
 
-thisdir = Path(__file__).resolve().parent
-testdir = thisdir.parent
-inputdir = testdir / "input"
-outputdir = testdir / "output"
-
 
 def test_get_jsonld_context():
     """Test get_jsonld_context()."""
@@ -73,12 +66,31 @@ def test_add():
     from tripper.dataset.dataset import add
 
     d = {}
-    add(d, "a", 1)
-    add(d, "b", 1)
-    add(d, "b", 1)
-    add(d, "a", 2)
-    add(d, "a", 1)
-    assert d == {"a": [1, 2], "b": 1}
+    add(d, "a", "1")
+    add(d, "b", "1")
+    add(d, "b", "1")
+    add(d, "a", "2")
+    add(d, "a", "1")
+    add(d, "a", {"c": "3"})
+    assert d == {"a": ["1", "2", {"c": "3"}], "b": "1"}
+
+
+def test_addnested():
+    """Test help-function addnested()."""
+    from tripper.dataset.dataset import addnested
+    from tripper.utils import AttrDict
+
+    d = AttrDict()
+    addnested(d, "a.b", "1")
+    assert d == {"a": {"b": "1"}}
+
+    addnested(d, "a", "2")
+    assert d == {"a": ["2", {"b": "1"}]}
+
+    addnested(d, "a.b.c", {"d": "3"})
+    assert d.a[0] == "2"
+    assert d.a[1].b[1].c == {"d": "3"}
+    assert d == {"a": ["2", {"b": ["1", {"c": {"d": "3"}}]}]}
 
 
 def test_get():
@@ -115,6 +127,8 @@ def test_datadoc():
     """Test save_datadoc() and load_dict()/save_dict()."""
     # pylint: disable=too-many-statements
 
+    from dataset_paths import indir  # pylint: disable=import-error
+
     from tripper import CHAMEO, DCAT, EMMO, OTEIO, Triplestore
     from tripper.dataset import load_dict, save_datadoc, save_dict, search_iris
 
@@ -124,7 +138,7 @@ def test_datadoc():
     ts = Triplestore("rdflib")
 
     # Load data documentation into triplestore
-    datadoc = save_datadoc(ts, inputdir / "semdata.yaml")
+    datadoc = save_datadoc(ts, indir / "semdata.yaml")
     assert isinstance(datadoc, dict)
     assert "@context" in datadoc
 
@@ -167,8 +181,8 @@ def test_datadoc():
     # Test save dict
     save_dict(
         ts,
-        "distribution",
-        {"@id": SEMDATA.newdistr, "format": "txt"},
+        dct={"@id": SEMDATA.newdistr, "format": "txt"},
+        type="distribution",
         prefixes={"echem": "https://w3id.org/emmo/domain/electrochemistry"},
     )
     newdistr = load_dict(ts, SEMDATA.newdistr)
@@ -206,11 +220,13 @@ def test_pipeline():
     from tripper import Triplestore
 
     otelib = pytest.importorskip("otelib")
 
+    from dataset_paths import indir  # pylint: disable=import-error
+
     from tripper.dataset import get_partial_pipeline, save_datadoc
 
     # Prepare triplestore
     ts = Triplestore("rdflib")
-    save_datadoc(ts, inputdir / "semdata.yaml")
+    save_datadoc(ts, indir / "semdata.yaml")
 
     SEMDATA = ts.namespaces["semdata"]
diff --git a/tests/dataset/test_tabledoc.py b/tests/dataset/test_tabledoc.py
new file mode 100644
index 00000000..278e7881
--- /dev/null
+++ b/tests/dataset/test_tabledoc.py
@@ -0,0 +1,83 @@
+"""Test the TableDoc class."""
+
+import pytest
+
+
+# if True:
+def test_as_dicts():
+    """Test the asdicts() method."""
+
+    pytest.importorskip("rdflib")
+
+    from tripper import DCAT, EMMO, Namespace, Triplestore
+    from tripper.dataset import TableDoc
+
+    ONTO = Namespace("http://example.com/onto#")
+    DS = Namespace("http://example.com/datasets#")
+
+    td = TableDoc(
+        header=[
+            "@id",
+            "@type",
+            "@type",
+            "inSeries",
+            "distribution.downloadURL",
+        ],
+        data=[
+            ("ds:s1", "onto:T1", "onto:T2", None, "file:///data/"),
+            ("ds:d1", "onto:T1", None, "ds:s1", "file:///data/d1.txt"),
+            ("ds:d2", "onto:T2", None, "ds:s1", "file:///data/d2.txt"),
+        ],
+        prefixes={
+            "onto": "http://example.com/onto#",
+            "ds": "http://example.com/datasets#",
+        },
+        # Replace the "ds" prefix above with this, once the "context" keyword
+        # argument is fully implemented.
+        # context={
+        #     "ds": "http://example.com/datasets#",
+        # },
+    )
+
+    s1, d1, d2 = td.asdicts()  # pylint: disable=unbalanced-tuple-unpacking
+
+    assert s1["@id"] == DS.s1
+    assert set(s1["@type"]) == {
+        DCAT.Dataset,
+        EMMO.DataSet,
+        ONTO.T1,
+        ONTO.T2,
+    }
+    assert "inSeries" not in s1
+    assert s1.distribution == {
+        "@type": DCAT.Distribution,
+        "downloadURL": "file:///data/",
+    }
+
+    assert d1["@id"] == DS.d1
+    assert set(d1["@type"]) == {
+        DCAT.Dataset,
+        EMMO.DataSet,
+        ONTO.T1,
+    }
+    assert d1.inSeries == DS.s1
+    assert d1.distribution == {
+        "@type": DCAT.Distribution,
+        "downloadURL": "file:///data/d1.txt",
+    }
+
+    assert d2["@id"] == DS.d2
+    assert set(d2["@type"]) == {
+        DCAT.Dataset,
+        EMMO.DataSet,
+        ONTO.T2,
+    }
+    assert d2.inSeries == DS.s1
+    assert d2.distribution == {
+        "@type": DCAT.Distribution,
+        "downloadURL": "file:///data/d2.txt",
+    }
+
+    ts = Triplestore(backend="rdflib")
+    td.save(ts)
+    print(ts.serialize())
diff --git a/tests/input/semdata.csv b/tests/input/semdata.csv
new file mode 100644
index 00000000..631d9e69
--- /dev/null
+++ b/tests/input/semdata.csv
@@ -0,0 +1,5 @@
+@id;@type;title;description;creator;contactPoint;inSeries;datamodel;datamodelStorage;distribution.downloadURL;distribution.mediaType;distribution.parser;fromSample;isDescriptionOf
+semdata:SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001;sem:SEMImage;SEM image of cement;Back-scattered SEM image of cement sample 77600 from Heidelberg, polished with 1 µm diamond compound.;Sigurd Wenner;Sigurd Wenner ;semdata:SEM_cement_batch2/77600-23-001;http://onto-ns.com/meta/matchmaker/0.2/SEMImage;https://github.com/HEU-MatCHMaker/DataDocumentation/blob/master/SEM/datamodels/SEMImage.yaml;https://github.com/EMMC-ASBL/tripper/raw/refs/heads/dataset/tests/input/77600-23-001_5kV_400x_m001.tif;image/tiff;parser:sem_hitachi;sample:SEM_cement_batch2/77600-23-001;mat:concrete1
+semdata:SEM_cement_batch2/77600-23-001;sem:SEMImageSeries;Series of SEM images of cement sample 77600;Back-scattered SEM image of cement sample 77600, polished with 1 µm diamond compound.;Sigurd Wenner;Sigurd Wenner ;semdata:SEM_cement_batch2; ;;sftp://nas.aimen.es/P_MATCHMAKER_SHARE_SINTEF/SEM_cement_batch2/77600-23-001;inode/directory;;;
+semdata:SEM_cement_batch2;sem:SEMImageSeries;Nested series of SEM images of cement batch2;…;Sigurd Wenner;Sigurd Wenner ; ;;;sftp://nas.aimen.es/P_MATCHMAKER_SHARE_SINTEF/SEM_cement_batch2;inode/directory;;;
+sample:SEM_cement_batch2/77600-23-001;chameo:Sample;Series for SEM images for sample 77600-23-001.; ;;;;;;;;;;
diff --git a/tripper/dataset/__init__.py b/tripper/dataset/__init__.py
index d6435b8d..e0b53d58 100644
--- a/tripper/dataset/__init__.py
+++ b/tripper/dataset/__init__.py
@@ -12,3 +12,4 @@
     save_dict,
     search_iris,
 )
+from .tabledoc import TableDoc
diff --git a/tripper/dataset/dataaccess.py b/tripper/dataset/dataaccess.py
index 672b2a59..3e248e36 100644
--- a/tripper/dataset/dataaccess.py
+++ b/tripper/dataset/dataaccess.py
@@ -175,9 +175,9 @@ def save(
 
     # Update triplestore
     ts.add_triples(triples)
     if save_dataset:
-        save_dict(ts, "dataset", dataset, prefixes=prefixes)
+        save_dict(ts, dataset, "dataset", prefixes=prefixes)
     elif save_distribution:
-        save_dict(ts, "distribution", distribution, prefixes=prefixes)
+        save_dict(ts, distribution, "distribution", prefixes=prefixes)
 
     return dataset["@id"]
diff --git a/tripper/dataset/dataset.py b/tripper/dataset/dataset.py
index 43519d4e..bbb4a178 100644
--- a/tripper/dataset/dataset.py
+++ b/tripper/dataset/dataset.py
@@ -42,9 +42,6 @@
 from pathlib import Path
 from typing import TYPE_CHECKING
 
-import requests
-import yaml  # type: ignore
-
 from tripper import DCAT, EMMO, OTEIO, OWL, RDF, Triplestore
 from tripper.utils import AttrDict, as_python
 
@@ -105,8 +102,8 @@
 
 def save_dict(
     ts: Triplestore,
-    type: str,
     dct: dict,
+    type: str = "dataset",
     prefixes: "Optional[dict]" = None,
     **kwargs,
 ) -> dict:
@@ -115,9 +112,11 @@
     Arguments:
         ts: Triplestore to save to.
-        type: Type of dict to save. Should be one of: "dataset",
-            "distribution", "parser" or "generator".
         dct: Dict with data to save.
+        type: Type of data to save. Should either be one of the
+            pre-defined names: "dataset", "distribution", "accessService",
+            "parser" and "generator" or an IRI to a class in an ontology.
+            Defaults to "dataset".
         prefixes: Dict with prefixes in addition to those included in the
             JSON-LD context. Should map namespace prefixes to IRIs.
         kwargs: Additional keyword arguments to add to the returned dict.
 
@@ -174,6 +173,8 @@ def save_extra_content(ts: Triplestore, dct: dict) -> None:
     - data models (require that DLite is installed)
 
     """
+    import requests
+
     # Save statements and mappings
     statements = get_values(dct, "statements")
     statements.extend(get_values(dct, "mappings"))
@@ -338,6 +339,9 @@ def get_values(
     return values
 
 
+# TODO: update this function to take an initial argument `context`,
+# which can be a URL (string), a dict with raw context or a list of
+# strings or dicts.
 @cache  # type: ignore
 def get_jsonld_context(timeout: float = 5, fromfile: bool = True) -> dict:
     """Returns the JSON-LD context as a dict.
@@ -351,6 +355,8 @@
         fromfile: Whether to load the context from local file.
 
     """
+    import requests
+
     if fromfile:
         with open(CONTEXT_PATH[7:], "r", encoding="utf-8") as f:
             context = json.load(f)["@context"]
@@ -360,6 +366,8 @@
     return context
 
 
+# TODO: update this to take an initial argument `context`.
+# See get_jsonld_context()
 def get_prefixes(timeout: float = 5) -> dict:
     """Loads the JSON-LD context and returns a dict mapping prefixes to
     their namespace URL."""
@@ -372,6 +380,8 @@
     return prefixes
 
 
+# TODO: update this to take an initial argument `context`.
+# See get_jsonld_context()
 def get_shortnames(timeout: float = 5) -> dict:
     """Loads the JSON-LD context and returns a dict mapping IRIs to
     their short names defined in the context."""
@@ -412,9 +422,64 @@ def add(d: dict, key: str, value: "Any") -> None:
         d[key] = value
     else:
         klst = d[key] if isinstance(d[key], list) else [d[key]]
-        vlst = value if isinstance(value, list) else [value]
-        v = list(set(klst).union(vlst))
-        d[key] = v[0] if len(v) == 1 else sorted(v)
+        if isinstance(value, dict):
+            v = klst if value in klst else klst + [value]
+        else:
+            vlst = value if isinstance(value, list) else [value]
+            try:
+                v = list(set(klst).union(vlst))
+            except TypeError:  # klst contains unhashable dicts
+                v = klst + [x for x in vlst if x not in klst]
+        d[key] = (
+            v[0]
+            if len(v) == 1
+            else sorted(
+                # Sort dicts at end, by representing them with a huge
+                # unicode character
+                v,
+                key=lambda x: "\uffff" if isinstance(x, dict) else x,
+            )
+        )
+
+
+def addnested(
+    d: "Union[dict, list]", key: str, value: "Any"
+) -> "Union[dict, list]":
+    """Like add(), but allows `key` to be a dot-separated list of sub-keys.
+    Returns the updated `d`.
+
+    Each sub-key will be added to `d` as a corresponding sub-dict.
+
+    Example:
+
+        >>> d = {}
+        >>> addnested(d, "a.b.c", "val") == {'a': {'b': {'c': 'val'}}}
+        True
+
+    """
+    if "." in key:
+        first, rest = key.split(".", 1)
+        if isinstance(d, list):
+            for ele in d:
+                if isinstance(ele, dict):
+                    addnested(ele, key, value)
+                    break
+            else:
+                d.append(addnested({}, key, value))
+        elif first in d and isinstance(d[first], (dict, list)):
+            addnested(d[first], rest, value)
+        else:
+            addnested(d, first, addnested(AttrDict(), rest, value))
+    elif isinstance(d, list):
+        for ele in d:
+            if isinstance(ele, dict):
+                add(ele, key, value)
+                break
+        else:
+            d.append({key: value})
+    else:
+        add(d, key, value)
+    return d
 
 
 def get(
@@ -451,6 +516,8 @@ def expand_iri(iri: str, prefixes: dict) -> str:
 def read_datadoc(filename: "Union[str, Path]") -> dict:
     """Read YAML data documentation and return it as a dict."""
+    import yaml  # type: ignore
+
     with open(filename, "r", encoding="utf-8") as f:
         d = yaml.safe_load(f)
     return prepare_datadoc(d)
 
@@ -521,6 +588,8 @@ def prepare_datadoc(datadoc: dict) -> dict:
     return d
 
 
+# TODO: update this function to correctly handle multiple contexts
+# provided with the `_context` keyword argument.
 def as_jsonld(
     dct: dict,
     type: "Optional[str]" = "dataset",
@@ -531,8 +600,8 @@ def as_jsonld(
     """Return an updated copy of dict `dct` as valid JSON-LD.
 
     Arguments:
-        dct: Dict to return an updated copy of.
-        type: Type of dict to prepare. Should either be one of the
+        dct: Dict with data documentation to represent as JSON-LD.
+        type: Type of data to document. Should either be one of the
             pre-defined names: "dataset", "distribution", "accessService",
             "parser" and "generator" or an IRI to a class in an ontology.
             Defaults to "dataset".
diff --git a/tripper/dataset/tabledoc.py b/tripper/dataset/tabledoc.py
new file mode 100644
index 00000000..9fd5d988
--- /dev/null
+++ b/tripper/dataset/tabledoc.py
@@ -0,0 +1,68 @@
+"""Basic interface for tabular documentation of datasets."""
+
+from typing import TYPE_CHECKING
+
+from tripper import Triplestore
+from tripper.dataset.dataset import addnested, as_jsonld, save_dict
+from tripper.utils import AttrDict
+
+if TYPE_CHECKING:  # pragma: no cover
+    from typing import List, Optional, Sequence, Union
+
+
+class TableDoc:
+    """Representation of tabular documentation of datasets.
+
+    Arguments:
+        header: Sequence of column header labels. Nested data can
+            be represented by dot-separated label strings (e.g.
+            "distribution.downloadURL").
+        data: Sequence of rows of data. Each row documents an entry.
+        type: Type of data to save (applies to all rows). Should
+            either be one of the pre-defined names: "dataset",
+            "distribution", "accessService", "parser" and "generator"
+            or an IRI to a class in an ontology. Defaults to
+            "dataset".
+        prefixes: Dict with prefixes in addition to those included in the
+            JSON-LD context. Should map namespace prefixes to IRIs.
+        context: Dict with user-defined JSON-LD context.
+
+    """
+
+    # pylint: disable=redefined-builtin,too-few-public-methods
+
+    def __init__(
+        self,
+        header: "Sequence[str]",
+        data: "Sequence[Sequence[str]]",
+        type: "Optional[str]" = "dataset",
+        prefixes: "Optional[dict]" = None,
+        context: "Optional[Union[dict, list]]" = None,
+    ):
+        self.header = header
+        self.data = data
+        self.type = type
+        self.prefixes = prefixes
+        self.context = context
+
+    def asdicts(self) -> "List[dict]":
+        """Return the table as a list of dicts."""
+        kw = {"@context": self.context} if self.context else {}
+
+        results = []
+        for row in self.data:
+            d = AttrDict()
+            for i, colname in enumerate(self.header):
+                cell = row[i]
+                if cell:
+                    addnested(d, colname, cell)
+            jsonld = as_jsonld(
+                d, type=self.type, prefixes=self.prefixes, **kw  # type: ignore
+            )
+            results.append(jsonld)
+        return results
+
+    def save(self, ts: Triplestore) -> None:
+        """Save tabular data documentation to the triplestore."""
+        for d in self.asdicts():
+            save_dict(ts, d)
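
Usage note (not part of the patch): the sketch below shows how the new `TableDoc` class is meant to be used, distilled from `tests/dataset/test_tabledoc.py` above. The `ex` prefix and the file URLs are illustrative only.

```python
from tripper import Triplestore
from tripper.dataset import TableDoc

# Two datasets documented as table rows; the dotted header label is
# expanded into a nested "distribution" dict by the new addnested() helper.
td = TableDoc(
    header=["@id", "@type", "distribution.downloadURL"],
    data=[
        ("ex:ds1", "ex:Image", "file:///data/img1.tiff"),
        ("ex:ds2", "ex:Image", "file:///data/img2.tiff"),
    ],
    prefixes={"ex": "http://example.com/ex#"},
)

# asdicts() returns one JSON-LD dict per row; as_jsonld() adds the
# default dcat:Dataset and emmo:DataSet types.
for d in td.asdicts():
    print(d["@id"], d["@type"])

# save() stores each dict in a triplestore via save_dict().
ts = Triplestore(backend="rdflib")
td.save(ts)
print(ts.serialize())
```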
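The new `tests/input/semdata.csv` hints at the intended workflow: export a spreadsheet and document each row as a dataset. This diff does not ship a CSV loader, so the wrapper below is only a sketch using Python's stdlib `csv` module; the helper name `tabledoc_from_csv` and the prefix URLs are hypothetical (in the tests the prefixes come from `tests/input/semdata.yaml`, which is not part of this diff).

```python
import csv

from tripper import Triplestore
from tripper.dataset import TableDoc

def tabledoc_from_csv(path, prefixes=None, delimiter=";"):
    """Read a delimited text file into a TableDoc (hypothetical helper)."""
    with open(path, newline="", encoding="utf-8") as f:
        reader = csv.reader(f, delimiter=delimiter)
        header = next(reader)  # first row holds the column labels
        # Blank cells become None so that TableDoc.asdicts() skips them.
        data = [[cell.strip() or None for cell in row] for row in reader]
    return TableDoc(header=header, data=data, prefixes=prefixes)

# Prefix URLs below are assumptions for the sake of a runnable example.
td = tabledoc_from_csv(
    "tests/input/semdata.csv",
    prefixes={
        "semdata": "https://example.com/semdata#",
        "sem": "https://example.com/sem#",
        "parser": "https://example.com/parser#",
        "sample": "https://example.com/sample#",
        "mat": "https://example.com/mat#",
        "chameo": "https://example.com/chameo#",
    },
)
ts = Triplestore(backend="rdflib")
td.save(ts)
```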