From f23669878ea9d87a4929d483217f4a61959fd9ea Mon Sep 17 00:00:00 2001
From: Jesper Friis
Date: Sun, 15 Dec 2024 19:00:44 +0100
Subject: [PATCH 01/13] Updated dataset, including the following changes:

- Allow adding other types of entries to the triplestore that are not
  datasets. Ex: samples, models, instruments, people, projects...
- Renamed list_data_iris() to search_iris(). It can now be used to search
  for all types of entries.
- Renamed prepare() to as_jsonld() and made it part of the public API

---
 tests/dataset/test_dataset.py    |  14 ++--
 tests/input/semdata.yaml         |  12 ++--
 tripper/context/0.2/context.json |   4 +-
 tripper/dataset/__init__.py      |   3 +-
 tripper/dataset/dataset.py       | 120 +++++++++++++++++++------------
 5 files changed, 91 insertions(+), 62 deletions(-)

diff --git a/tests/dataset/test_dataset.py b/tests/dataset/test_dataset.py
index 4aa8fbdb..105db426 100644
--- a/tests/dataset/test_dataset.py
+++ b/tests/dataset/test_dataset.py
@@ -116,12 +116,7 @@ def test_datadoc():
     # pylint: disable=too-many-statements
     from tripper import CHAMEO, DCAT, EMMO, OTEIO, Triplestore
-    from tripper.dataset import (
-        list_dataset_iris,
-        load_dict,
-        save_datadoc,
-        save_dict,
-    )
+    from tripper.dataset import load_dict, save_datadoc, save_dict, search_iris

     pytest.importorskip("dlite")
     pytest.importorskip("rdflib")
@@ -188,20 +183,19 @@ def test_datadoc():

     # Test searching the triplestore
     SAMPLE = ts.namespaces["sample"]
-    datasets = list_dataset_iris(ts)
+    datasets = search_iris(ts)
     named_datasets = {
         SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"],
         SEMDATA["SEM_cement_batch2/77600-23-001"],
         SEMDATA["SEM_cement_batch2"],
-        SAMPLE["SEM_cement_batch2/77600-23-001"],
     }
     assert not named_datasets.difference(datasets)
-    assert set(list_dataset_iris(ts, creator="Sigurd Wenner")) == {
+    assert set(search_iris(ts, creator="Sigurd Wenner")) == {
         SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"],
         SEMDATA["SEM_cement_batch2/77600-23-001"],
         SEMDATA["SEM_cement_batch2"],
     }
-    assert set(list_dataset_iris(ts, _type=CHAMEO.Sample)) == {
+    assert set(search_iris(ts, type=CHAMEO.Sample)) == {
         SAMPLE["SEM_cement_batch2/77600-23-001"],
     }

diff --git a/tests/input/semdata.yaml b/tests/input/semdata.yaml
index 0e99919f..2d1da201 100644
--- a/tests/input/semdata.yaml
+++ b/tests/input/semdata.yaml
@@ -1,4 +1,5 @@
 ---
+
 # This extends the list of prefixes that are already defined in the context
 prefixes:
   sem: https://w3id.com/emmo/domain/sem/0.1#
@@ -62,10 +63,6 @@ datasets:
       downloadURL: sftp://nas.aimen.es/P_MATCHMAKER_SHARE_SINTEF/SEM_cement_batch2
       mediaType: inode/directory

-  - "@id": sample:SEM_cement_batch2/77600-23-001
-    "@type": chameo:Sample
-    title: Series for SEM images for sample 77600-23-001.
-
 parsers:
   - "@id": parser:sem_hitachi
@@ -81,3 +78,10 @@ generators:
     generatorType: application/vnd.dlite-generate
     configuration:
       driver: hitachi
+
+
+# Other entities, like samples, instruments, persons, models etc...
+other_entries:
+  - "@id": sample:SEM_cement_batch2/77600-23-001
+    "@type": chameo:Sample
+    title: Series for SEM images for sample 77600-23-001.
diff --git a/tripper/context/0.2/context.json b/tripper/context/0.2/context.json index d5903ba9..3f658c0d 100644 --- a/tripper/context/0.2/context.json +++ b/tripper/context/0.2/context.json @@ -32,8 +32,8 @@ "hasCurrentVersion": "dcat:hasCurrentVersion", "hasVersion": "dcat:hasVersion", "inSeries": { - "@id" : "dcat:inSeries", - "@type" : "@id" + "@id": "dcat:inSeries", + "@type": "@id" }, "keyword": "dcat:keyword", "landingPage": "dcat:landingPage", diff --git a/tripper/dataset/__init__.py b/tripper/dataset/__init__.py index 0a3a5088..d6435b8d 100644 --- a/tripper/dataset/__init__.py +++ b/tripper/dataset/__init__.py @@ -2,12 +2,13 @@ from .dataaccess import load, save from .dataset import ( + as_jsonld, get_jsonld_context, get_partial_pipeline, get_prefixes, - list_dataset_iris, load_dict, read_datadoc, save_datadoc, save_dict, + search_iris, ) diff --git a/tripper/dataset/dataset.py b/tripper/dataset/dataset.py index 0387328d..ba4a0fde 100644 --- a/tripper/dataset/dataset.py +++ b/tripper/dataset/dataset.py @@ -9,12 +9,13 @@ - `save_datadoc()`: Save documentation from YAML file to the triplestore. Functions for searching the triplestore: - - `list_dataset_iris()`: Get IRIs of matching datasets. + - `search_iris()`: Get IRIs of matching entries in the triplestore. Functions for working with the dict-representation: - `read_datadoc()`: Read documentation from YAML file and return it as dict. - `save_dict()`: Save dict documentation to the triplestore. - `load_dict()`: Load dict documentation from the triplestore. + - `as_jsonld()`: Return the dict as JSON-LD (represented as a Python dict) Functions for interaction with OTEAPI: - `get_partial_pipeline()`: Returns a OTELib partial pipeline. @@ -28,7 +29,6 @@ """ -# pylint: enable=line-too-long # pylint: disable=invalid-name,redefined-builtin,import-outside-toplevel import functools import io @@ -41,7 +41,7 @@ import requests import yaml # type: ignore -from tripper import DCAT, EMMO, OTEIO, RDF, Triplestore +from tripper import DCAT, EMMO, OTEIO, OWL, RDF, Triplestore from tripper.utils import AttrDict, as_python if TYPE_CHECKING: # pragma: no cover @@ -90,6 +90,12 @@ "datadoc_label": "datasets", "@type": [DCAT.Dataset, EMMO.DataSet], }, + "entry": { + # General datacatalog entry that is not one of the above + # Ex: samples, instruments, models, people, projects, ... + "datadoc_label": "other_entries", # XXX better label? + "@type": OWL.NamedIndividual, + }, } @@ -120,14 +126,15 @@ def save_dict( Notes: The keys in `dct` and `kwargs` may be either properties defined in the - [JSON-LD context](https://raw.githubusercontent.com/EMMC-ASBL/oteapi-dlite/refs/heads/rdf-serialisation/oteapi_dlite/context/0.2/context.json) - or one of the following special keywords: + [JSON-LD context] or one of the following special keywords: - "@id": Dataset IRI. Must always be given. - "@type": IRI of the ontology class for this type of data. For datasets, it is typically used to refer to a specific subclass of `emmo:DataSet` that provides a semantic description of this dataset. 
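+
+    Example:
+        A minimal sketch of documenting a dataset (the `ex` prefix and
+        the IRIs below are illustrative only, not part of the
+        predefined context):
+
+        >>> ts = Triplestore(backend="rdflib")  # doctest: +SKIP
+        >>> save_dict(
+        ...     ts,
+        ...     dct={
+        ...         "@id": "ex:mydata",
+        ...         "distribution": {
+        ...             "downloadURL": "http://example.com/mydata.csv",
+        ...             "mediaType": "text/csv",
+        ...         },
+        ...     },
+        ...     type="dataset",
+        ...     prefixes={"ex": "http://example.com/ex#"},
+        ... )  # doctest: +SKIP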
+
+    References:
+    [JSON-LD context]: https://raw.githubusercontent.com/EMMC-ASBL/oteapi-dlite/refs/heads/rdf-serialisation/oteapi_dlite/context/0.2/context.json
     """
     if "@id" not in dct:
         raise ValueError("`dct` must have an '@id' key")
@@ -136,7 +143,7 @@
     if prefixes:
         all_prefixes.update(prefixes)

-    d = prepare(type=type, dct=dct, prefixes=all_prefixes, **kwargs)
+    d = as_jsonld(dct=dct, type=type, prefixes=all_prefixes, **kwargs)

     # Bind prefixes
     for prefix, ns in all_prefixes.items():
@@ -199,8 +206,7 @@ def save_extra_content(ts: Triplestore, dct: dict) -> None:
         except (
             dlite.DLiteMissingInstanceError  # pylint: disable=no-member
         ):
-            # __FIXME__: check session whether want to warn or re-reise
-            # in this case
+            # __FIXME__: check session whether to warn or re-raise
             warnings.warn(f"cannot load datamodel: {uri}")
         else:
             add_dataset(ts, dm)
@@ -476,7 +482,7 @@ def save_datadoc(
     for spec in dicttypes.values():
         label = spec["datadoc_label"]
         for dct in get(d, label):
-            dct = prepare(types[label], dct, prefixes=prefixes)
+            dct = as_jsonld(dct=dct, type=types[label], prefixes=prefixes)
             f = io.StringIO(json.dumps(dct))
             with Triplestore(backend="rdflib") as ts2:
                 ts2.parse(f, format="json-ld")
@@ -505,52 +511,65 @@ def prepare_datadoc(datadoc: dict) -> dict:
     for type, spec in dicttypes.items():
         label = spec["datadoc_label"]
         for i, dct in enumerate(get(d, label)):
-            d[label][i] = prepare(type, dct, prefixes=d.prefixes)
+            d[label][i] = as_jsonld(dct=dct, type=type, prefixes=d.prefixes)
     return d


-def prepare(
-    type: str, dct: dict, prefixes: dict, _recur: bool = False, **kwargs
+def as_jsonld(
+    dct: dict,
+    type: "Optional[str]" = "dataset",
+    prefixes: "Optional[dict]" = None,
+    _entryid: "Optional[str]" = None,
+    **kwargs,
 ) -> dict:
-    """Return an updated copy of dict `dct` with additional key-value
-    pairs needed for serialisation to RDF.
+    """Return an updated copy of dict `dct` as valid JSON-LD.

     Arguments:
-        type: Type of dict to prepare. Should be one of: "dataset",
-            "distribution", "parser" or "generator".
         dct: Dict to return an updated copy of.
+        type: Type of dict to prepare. Should either be one of the
+            pre-defined names: "dataset", "distribution", "accessService",
+            "parser" and "generator" or an IRI to a class in an ontology.
+            Defaults to "dataset".
         prefixes: Dict with prefixes in addition to those included in the
             JSON-LD context. Should map namespace prefixes to IRIs.
-        _recur: Whether this function is called recursively. Intended for
-            internal use.
+        _entryid: Id of base entry that is documented. Intended for
+            internal use only.
         kwargs: Additional keyword arguments to add to the returned dict.
            A leading underscore in a key will be translated to a
-           leading "@"-sign. For example, "@id=..." may be provided
-           as "_id=...".
+           leading "@"-sign. For example, "@id" or "@context" may be
+           provided as "_id" or "_context", respectively.
+
    Returns:
-        An updated copy of `dct`.
+        An updated copy of `dct` as valid JSON-LD.
    """
    # pylint: disable=too-many-branches
-    if type not in dicttypes:
-        raise ValueError(
-            f"`type` must be one of: {', '.join(dicttypes.keys())}. "
-            f"Got: '{type}'"
-        )
-    spec = dicttypes[type]
-
     d = AttrDict()
-    if not _recur:
+    if not _entryid:
         d["@context"] = CONTEXT_URL
-    add(d, "@type", spec["@type"])  # get type at top
-    d.update(dct)
-    add(d, "@type", spec["@type"])  # readd type if overwritten
+
+    if type:
+        t = dicttypes[type]["@type"] if type in dicttypes else type
+        add(d, "@type", t)  # get type at top
+        d.update(dct)
+        add(d, "@type", t)  # readd type if overwritten
+    else:
+        d.update(dct)

     for k, v in kwargs.items():
         key = f"@{k[1:]}" if re.match("^_([^_]|([^_].*[^_]))$", k) else k
         add(d, key, v)

+    if "@id" not in d and not _entryid:
+        raise ValueError("Missing '@id' in dict to document")
+
+    if not _entryid:
+        _entryid = d["@id"]
+
+    if "@type" not in d:
+        warnings.warn(f"Missing '@type' in dict to document: {_entryid}")
+
     all_prefixes = get_prefixes()
     if prefixes:
         all_prefixes.update(prefixes)
@@ -584,9 +603,11 @@
                     if isinstance(e, str):
                         v[i] = expand_iri(e, all_prefixes)
                     elif isinstance(e, dict) and k in nested:
-                        v[i] = prepare(k, e, prefixes=prefixes)
+                        v[i] = as_jsonld(
+                            e, k, _entryid=_entryid, prefixes=prefixes
+                        )
                 elif isinstance(v, dict) and k in nested:
-                    d[k] = prepare(k, v, prefixes=prefixes)
+                    d[k] = as_jsonld(v, k, _entryid=_entryid, prefixes=prefixes)

     return d


@@ -711,31 +732,42 @@ def get_partial_pipeline(
     return pipeline


-def list_dataset_iris(ts: Triplestore, **kwargs):
-    """Return a list of IRIs for all datasets matching a set of criterias
-    specified by `kwargs`.
+def search_iris(ts: Triplestore, type=DCAT.Dataset, **kwargs):
+    """Return a list of IRIs for all entries of the given type.
+
+    Additional matching criteria can be specified by `kwargs`.
+
     Arguments:
         ts: Triplestore to search.
+        type: Search for entries that are individuals of the class with
+            this IRI. The default is `dcat:Dataset`.
         kwargs: Match criterias.

     Examples:
         List all dataset IRIs:

-        list_dataset_iris(ts)
+        search_iris(ts)

         List IRIs of all datasets with John Doe as `contactPoint`:

-        list_dataset_iris(ts, contactPoint="John Doe")
+        search_iris(ts, contactPoint="John Doe")
+
+        List IRIs of all samples:

-    List IRIs of all datasets with John Doe as `contactPoint` AND that are
+        search_iris(ts, type=CHAMEO.Sample)
+
+    List IRIs of all datasets with John Doe as `contactPoint` AND that are
         measured on a given sample:

-        list_dataset_iris(
+        search_iris(
             ts, contactPoint="John Doe", fromSample=SAMPLE.batch2/sample3
         )
     """
     crit = []
+
+    if type:
+        crit.append(f"  ?iri rdf:type <{type}> .")
+
     expanded = {v: k for k, v in get_shortnames().items()}
     for k, v in kwargs.items():
         key = f"@{k[1:]}" if k.startswith("_") else k
             )
         else:
             value = v
-        crit.append(f"  ?dataset <{predicate}> {value} .")
+        crit.append(f"  ?iri <{predicate}> {value} .")
     criterias = "\n".join(crit)
     query = f"""
     PREFIX rdf: <{RDF}>
-    PREFIX dcat: <{DCAT}>
-
-    SELECT ?dataset
+    SELECT ?iri
     WHERE {{
-        ?dataset rdf:type dcat:Dataset .
         {criterias}
     }}
     """

From 94fa59a0788f49a33964aad14d0008b91ea4cf18 Mon Sep 17 00:00:00 2001
From: Jesper Friis
Date: Mon, 16 Dec 2024 00:18:00 +0100
Subject: [PATCH 02/13] Added new TableDoc class providing a table interface
 for data documentation.
--- docs/api_reference/dataset/tabledoc.md | 3 + pyproject.toml | 7 ++- tests/dataset/dataset_paths.py | 12 ++++ tests/dataset/test_dataaccess.py | 16 ++---- tests/dataset/test_dataset.py | 47 +++++++++------ tests/dataset/test_tabledoc.py | 79 ++++++++++++++++++++++++++ tripper/dataset/__init__.py | 1 + tripper/dataset/dataaccess.py | 4 +- tripper/dataset/dataset.py | 79 +++++++++++++++++++++++--- tripper/dataset/tabledoc.py | 68 ++++++++++++++++++++++ 10 files changed, 275 insertions(+), 41 deletions(-) create mode 100644 docs/api_reference/dataset/tabledoc.md create mode 100644 tests/dataset/dataset_paths.py create mode 100644 tests/dataset/test_tabledoc.py create mode 100644 tripper/dataset/tabledoc.py diff --git a/docs/api_reference/dataset/tabledoc.md b/docs/api_reference/dataset/tabledoc.md new file mode 100644 index 00000000..f3a73929 --- /dev/null +++ b/docs/api_reference/dataset/tabledoc.md @@ -0,0 +1,3 @@ +# tabledoc + +::: tripper.dataset.tabledoc diff --git a/pyproject.toml b/pyproject.toml index d5f7f94a..21196860 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -104,7 +104,8 @@ max-public-methods = 25 max-locals = 20 disable = [ "fixme", - "too-many-positional-arguments", + "invalid-name", + #"too-many-positional-arguments", ] good-names = [ # Default @@ -115,8 +116,8 @@ good-names = [ "s", "p", "o", # Namespaces "EX", - # dict, value, file, ... - "d", "v", "f", + # dict, value, file, keyword... + "d", "v", "f", "kw", ] [tool.pytest.ini_options] diff --git a/tests/dataset/dataset_paths.py b/tests/dataset/dataset_paths.py new file mode 100644 index 00000000..e84b2f47 --- /dev/null +++ b/tests/dataset/dataset_paths.py @@ -0,0 +1,12 @@ +"""Defines paths for tests. + +It defines some directories and some utility functions that can be used +with or without conftest. +""" + +from pathlib import Path + +testdir = Path(__file__).resolve().parent.parent +ontodir = testdir / "ontologies" +indir = testdir / "input" +outdir = testdir / "output" diff --git a/tests/dataset/test_dataaccess.py b/tests/dataset/test_dataaccess.py index c3a7b75d..bdc0ef45 100644 --- a/tests/dataset/test_dataaccess.py +++ b/tests/dataset/test_dataaccess.py @@ -2,18 +2,12 @@ # pylint: disable=invalid-name,too-many-locals,duplicate-code -from pathlib import Path - import pytest +from dataset_paths import outdir pytest.importorskip("yaml") pytest.importorskip("requests") -thisdir = Path(__file__).resolve().parent -testdir = thisdir.parent -inputdir = testdir / "input" -outputdir = testdir / "output" - # if True: def test_save_and_load(): @@ -38,7 +32,6 @@ def test_save_and_load(): # Test save dict save_dict( ts, - type="dataset", dct={ "@id": SEMDATA.img1, "distribution": { @@ -49,6 +42,7 @@ def test_save_and_load(): "format": "tiff", }, }, + type="dataset", ) newdistr = load_dict(ts, SEMDATA.img1) assert newdistr["@type"] == [DCAT.Dataset, EMMO.DataSet] @@ -57,12 +51,12 @@ def test_save_and_load(): save_dict( ts, - type="generator", dct={ "@id": GEN.sem_hitachi, "generatorType": "application/vnd.dlite-generate", "configuration": {"driver": "hitachi"}, }, + type="generator", ) # Test load dataset (this downloads an actual image from github) @@ -70,7 +64,7 @@ def test_save_and_load(): assert len(data) == 53502 # Test save dataset with anonymous distribution - newfile = outputdir / "newimage.tiff" + newfile = outdir / "newimage.tiff" newfile.unlink(missing_ok=True) buf = b"some bytes..." 
save( @@ -94,7 +88,7 @@ def test_save_and_load(): assert newimage.distribution.downloadURL == f"file:{newfile}" # Test save dataset with named distribution - newfile2 = outputdir / "newimage.png" + newfile2 = outdir / "newimage.png" newfile2.unlink(missing_ok=True) save( ts, diff --git a/tests/dataset/test_dataset.py b/tests/dataset/test_dataset.py index 105db426..4e43cd10 100644 --- a/tests/dataset/test_dataset.py +++ b/tests/dataset/test_dataset.py @@ -2,18 +2,12 @@ # pylint: disable=invalid-name,too-many-locals,duplicate-code -from pathlib import Path - import pytest +from dataset_paths import indir pytest.importorskip("yaml") pytest.importorskip("requests") -thisdir = Path(__file__).resolve().parent -testdir = thisdir.parent -inputdir = testdir / "input" -outputdir = testdir / "output" - def test_get_jsonld_context(): """Test get_jsonld_context().""" @@ -73,12 +67,31 @@ def test_add(): from tripper.dataset.dataset import add d = {} - add(d, "a", 1) - add(d, "b", 1) - add(d, "b", 1) - add(d, "a", 2) - add(d, "a", 1) - assert d == {"a": [1, 2], "b": 1} + add(d, "a", "1") + add(d, "b", "1") + add(d, "b", "1") + add(d, "a", "2") + add(d, "a", "1") + add(d, "a", {"c": "3"}) + assert d == {"a": ["1", "2", {"c": "3"}], "b": "1"} + + +def test_addnested(): + """Test help-function addnested().""" + from tripper.dataset.dataset import addnested + from tripper.utils import AttrDict + + d = AttrDict() + addnested(d, "a.b", "1") + assert d == {"a": {"b": "1"}} + + addnested(d, "a", "2") + assert d == {"a": ["2", {"b": "1"}]} + + addnested(d, "a.b.c", {"d": "3"}) + assert d.a[0] == "2" + assert d.a[1].b[1].c == {"d": "3"} + assert d == {"a": ["2", {"b": ["1", {"c": {"d": "3"}}]}]} def test_get(): @@ -124,7 +137,7 @@ def test_datadoc(): ts = Triplestore("rdflib") # Load data documentation into triplestore - datadoc = save_datadoc(ts, inputdir / "semdata.yaml") + datadoc = save_datadoc(ts, indir / "semdata.yaml") assert isinstance(datadoc, dict) assert "@context" in datadoc @@ -167,8 +180,8 @@ def test_datadoc(): # Test save dict save_dict( ts, - "distribution", - {"@id": SEMDATA.newdistr, "format": "txt"}, + dct={"@id": SEMDATA.newdistr, "format": "txt"}, + type="distribution", prefixes={"echem": "https://w3id.org/emmo/domain/electrochemistry"}, ) newdistr = load_dict(ts, SEMDATA.newdistr) @@ -210,7 +223,7 @@ def test_pipeline(): # Prepare triplestore ts = Triplestore("rdflib") - save_datadoc(ts, inputdir / "semdata.yaml") + save_datadoc(ts, indir / "semdata.yaml") SEMDATA = ts.namespaces["semdata"] diff --git a/tests/dataset/test_tabledoc.py b/tests/dataset/test_tabledoc.py new file mode 100644 index 00000000..77e81dcc --- /dev/null +++ b/tests/dataset/test_tabledoc.py @@ -0,0 +1,79 @@ +"""Test the dataset module.""" + +from tripper import Triplestore +from tripper.dataset import TableDoc + + +# if True: +def test_as_dicts(): + """Test the as_dicts() method.""" + + from tripper import DCAT, EMMO, Namespace + + ONTO = Namespace("http:/example.com/onto#") + DS = Namespace("http:/example.com/datasets#") + + td = TableDoc( + header=[ + "@id", + "@type", + "@type", + "inSeries", + "distribution.downloadURL", + ], + data=[ + ("ds:s1", "onto:T1", "onto:T2", None, "file:///data/"), + ("ds:d1", "onto:T1", None, "ds:s1", "file:///data/d1.txt"), + ("ds:d2", "onto:T2", None, "ds:s1", "file:///data/d2.txt"), + ], + prefixes={ + "onto": "http:/example.com/onto#", + "ds": "http:/example.com/datasets#", + }, + # context={ + # "ds": "http:/example.com/datasets#", + # }, + ) + + s1, d1, d2 = td.asdicts() # pylint: 
disable=unbalanced-tuple-unpacking + + assert s1["@id"] == DS.s1 + assert set(s1["@type"]) == { + DCAT.Dataset, + EMMO.DataSet, + ONTO.T1, + ONTO.T2, + } + assert "inSeries" not in s1 + assert s1.distribution == { + "@type": DCAT.Distribution, + "downloadURL": "file:///data/", + } + + assert d1["@id"] == DS.d1 + assert set(d1["@type"]) == { + DCAT.Dataset, + EMMO.DataSet, + ONTO.T1, + } + assert d1.inSeries == DS.s1 + assert d1.distribution == { + "@type": DCAT.Distribution, + "downloadURL": "file:///data/d1.txt", + } + + assert d2["@id"] == DS.d2 + assert set(d2["@type"]) == { + DCAT.Dataset, + EMMO.DataSet, + ONTO.T2, + } + assert d2.inSeries == DS.s1 + assert d2.distribution == { + "@type": DCAT.Distribution, + "downloadURL": "file:///data/d2.txt", + } + + ts = Triplestore(backend="rdflib") + td.save(ts) + print(ts.serialize()) diff --git a/tripper/dataset/__init__.py b/tripper/dataset/__init__.py index d6435b8d..e0b53d58 100644 --- a/tripper/dataset/__init__.py +++ b/tripper/dataset/__init__.py @@ -12,3 +12,4 @@ save_dict, search_iris, ) +from .tabledoc import TableDoc diff --git a/tripper/dataset/dataaccess.py b/tripper/dataset/dataaccess.py index 672b2a59..3e248e36 100644 --- a/tripper/dataset/dataaccess.py +++ b/tripper/dataset/dataaccess.py @@ -175,9 +175,9 @@ def save( # Update triplestore ts.add_triples(triples) if save_dataset: - save_dict(ts, "dataset", dataset, prefixes=prefixes) + save_dict(ts, dataset, "dataset", prefixes=prefixes) elif save_distribution: - save_dict(ts, "distribution", distribution, prefixes=prefixes) + save_dict(ts, distribution, "distribution", prefixes=prefixes) return dataset["@id"] diff --git a/tripper/dataset/dataset.py b/tripper/dataset/dataset.py index ba4a0fde..2bb5e6a1 100644 --- a/tripper/dataset/dataset.py +++ b/tripper/dataset/dataset.py @@ -101,8 +101,8 @@ def save_dict( ts: Triplestore, - type: str, dct: dict, + type: str = "dataset", prefixes: "Optional[dict]" = None, **kwargs, ) -> dict: @@ -111,9 +111,11 @@ def save_dict( Arguments: ts: Triplestore to save to. - type: Type of dict to save. Should be one of: "dataset", - "distribution", "parser" or "generator". dct: Dict with data to save. + type: Type of data to save. Should either be one of the + pre-defined names: "dataset", "distribution", "accessService", + "parser" and "generator" or an IRI to a class in an ontology. + Defaults to "dataset". prefixes: Dict with prefixes in addition to those included in the JSON-LD context. Should map namespace prefixes to IRIs. kwargs: Additional keyword arguments to add to the returned dict. @@ -333,6 +335,9 @@ def get_values( return values +# TODO: update this function to take an initial argument `context`, +# which can be an URL (string), dict with raw context or a list of +# strings or dicts. @cache # type: ignore def get_jsonld_context(timeout: float = 5, fromfile: bool = True) -> dict: """Returns the JSON-LD context as a dict. @@ -355,6 +360,8 @@ def get_jsonld_context(timeout: float = 5, fromfile: bool = True) -> dict: return context +# TODO: update this to take an initial argument `context`. +# See get_jsonld_context() def get_prefixes(timeout: float = 5) -> dict: """Loads the JSON-LD context and returns a dict mapping prefixes to their namespace URL.""" @@ -367,6 +374,8 @@ def get_prefixes(timeout: float = 5) -> dict: return prefixes +# TODO: update this to take an initial argument `context`. 
+# See get_jsonld_context() def get_shortnames(timeout: float = 5) -> dict: """Loads the JSON-LD context and returns a dict mapping IRIs to their short names defined in the context.""" @@ -407,9 +416,61 @@ def add(d: dict, key: str, value: "Any") -> None: d[key] = value else: klst = d[key] if isinstance(d[key], list) else [d[key]] - vlst = value if isinstance(value, list) else [value] - v = list(set(klst).union(vlst)) - d[key] = v[0] if len(v) == 1 else sorted(v) + if isinstance(value, dict): + v = klst if value in klst else klst + [value] + else: + vlst = value if isinstance(value, list) else [value] + try: + v = list(set(klst).union(vlst)) + except TypeError: # klst contains unhashable dicts + v = klst + [x for x in vlst if x not in klst] + d[key] = ( + v[0] + if len(v) == 1 + else sorted( + # Sort dicts at end, by representing them with a huge + # unicode character + v, + key=lambda x: "\uffff" if isinstance(x, dict) else x, + ) + ) + + +def addnested(d: "Union[dict, list]", key: str, value: "Any"): + """Like add(), but allows `key` to be a dot-separated list of sub-keys. + + Each sub-key will be added to `d` as a corresponding sub-dict. + + Example: + + >>> d = {} + >>> addnested(d, "a.b.c", "val") + {'a': {'b': {'c': 'val'}}} + + """ + if "." in key: + first, rest = key.split(".", 1) + if isinstance(d, list): + for ele in d: + if isinstance(ele, dict): + addnested(ele, key, value) + break + else: + d.append(addnested({}, key, value)) + elif first in d and isinstance(d[first], (dict, list)): + addnested(d[first], rest, value) + else: + addnested(d, first, addnested(AttrDict(), rest, value)) + elif isinstance(d, list): + for ele in d: + if isinstance(ele, dict): + add(ele, key, value) + break + else: + d.append({key: value}) + else: + add(d, key, value) + return d def get( @@ -516,6 +577,8 @@ def prepare_datadoc(datadoc: dict) -> dict: return d +# TODO: update this function to correctly handle multiple contexts +# provided with the `_context` keyword argument. def as_jsonld( dct: dict, type: "Optional[str]" = "dataset", @@ -526,8 +589,8 @@ def as_jsonld( """Return an updated copy of dict `dct` as valid JSON-LD. Arguments: - dct: Dict to return an updated copy of. - type: Type of dict to prepare. Should either be one of the + dct: Dict with data documentation represent as JSON-LD. + type: Type of data to document. Should either be one of the pre-defined names: "dataset", "distribution", "accessService", "parser" and "generator" or an IRI to a class in an ontology. Defaults to "dataset". diff --git a/tripper/dataset/tabledoc.py b/tripper/dataset/tabledoc.py new file mode 100644 index 00000000..b1a8ef51 --- /dev/null +++ b/tripper/dataset/tabledoc.py @@ -0,0 +1,68 @@ +"""Basic interface for tabular documentation of datasets.""" + +from typing import TYPE_CHECKING + +from tripper import Triplestore +from tripper.dataset.dataset import addnested, as_jsonld, save_dict +from tripper.utils import AttrDict + +if TYPE_CHECKING: # pragma: no cover + from typing import List, Optional, Sequence, Union + + +class TableDoc: + """Representation of tabular documentation of datasets. + + Arguments: + header: Sequence of column header labels. Nested data can + be represented by dot-separated label strings (e.g. + "distribution.downloadURL") + data: Sequence of rows of data. Each row documents an entry. + type: Type of data to save (applies to all rows). 
Should
+            either be one of the pre-defined names: "dataset",
+            "distribution", "accessService", "parser" and "generator"
+            or an IRI to a class in an ontology. Defaults to
+            "dataset".
+        prefixes: Dict with prefixes in addition to those included in the
+            JSON-LD context. Should map namespace prefixes to IRIs.
+        context: Dict with user-defined JSON-LD context.
+
+    """
+
+    # pylint: disable=redefined-builtin,too-few-public-methods
+
+    def __init__(
+        self,
+        header: "Sequence[str]",
+        data: "Sequence[Sequence[str]]",
+        type: "Optional[str]" = "dataset",
+        prefixes: "Optional[dict]" = None,
+        context: "Optional[Union[dict, list]]" = None,
+    ):
+        self.header = header
+        self.data = data
+        self.type = type
+        self.prefixes = prefixes
+        self.context = context
+
+    def asdicts(self) -> "List[dict]":
+        """Return the table as a list of dicts."""
+        kw = {"_context": self.context} if self.context else {}
+
+        results = []
+        for row in self.data:
+            d = AttrDict()
+            for i, colname in enumerate(self.header):
+                cell = row[i]
+                if cell:
+                    addnested(d, colname, cell)
+            jsonld = as_jsonld(
+                d, type=self.type, prefixes=self.prefixes, **kw  # type: ignore
+            )
+            results.append(jsonld)
+        return results
+
+    def save(self, ts: Triplestore) -> None:
+        """Save tabular data documentation to the triplestore."""
+        for d in self.asdicts():
+            save_dict(ts, d)

From 028054fad4ae6a7e0055ad746b90f200c807e965 Mon Sep 17 00:00:00 2001
From: Jesper Friis
Date: Mon, 16 Dec 2024 00:39:21 +0100
Subject: [PATCH 03/13] Import indir/outdir inside test functions

---
 tests/dataset/test_dataaccess.py | 3 ++-
 tests/dataset/test_dataset.py    | 5 ++++-
 tripper/dataset/dataset.py       | 3 ++-
 3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/tests/dataset/test_dataaccess.py b/tests/dataset/test_dataaccess.py
index bdc0ef45..0cbc7727 100644
--- a/tests/dataset/test_dataaccess.py
+++ b/tests/dataset/test_dataaccess.py
@@ -3,7 +3,6 @@
 # pylint: disable=invalid-name,too-many-locals,duplicate-code

 import pytest
-from dataset_paths import outdir

 pytest.importorskip("yaml")
 pytest.importorskip("requests")
@@ -14,6 +13,8 @@ def test_save_and_load():
     """Test save() and load()."""
     # pylint: disable=too-many-statements

+    from dataset_paths import outdir
+
     from tripper import DCAT, DCTERMS, EMMO, Triplestore
     from tripper.dataset import load, load_dict, save, save_dict

diff --git a/tests/dataset/test_dataset.py b/tests/dataset/test_dataset.py
index 4e43cd10..1a0cffbd 100644
--- a/tests/dataset/test_dataset.py
+++ b/tests/dataset/test_dataset.py
@@ -3,7 +3,6 @@
 # pylint: disable=invalid-name,too-many-locals,duplicate-code

 import pytest
-from dataset_paths import indir

 pytest.importorskip("yaml")
 pytest.importorskip("requests")
@@ -128,6 +127,8 @@ def test_datadoc():
     """Test save_datadoc() and load_dict()/save_dict()."""
     # pylint: disable=too-many-statements

+    from dataset_paths import indir
+
     from tripper import CHAMEO, DCAT, EMMO, OTEIO, Triplestore
     from tripper.dataset import load_dict, save_datadoc, save_dict, search_iris

@@ -219,6 +220,8 @@ def test_pipeline():
     from tripper import Triplestore

     otelib = pytest.importorskip("otelib")

+    from dataset_paths import indir
+
     from tripper.dataset import get_partial_pipeline, save_datadoc

     # Prepare triplestore
diff --git a/tripper/dataset/dataset.py b/tripper/dataset/dataset.py
index 2bb5e6a1..13fdb935 100644
--- a/tripper/dataset/dataset.py
+++ b/tripper/dataset/dataset.py
@@ -445,7 +445,8 @@ def addnested(d: "Union[dict, list]", key: str, value: "Any"):

         >>> d = {}
         >>> addnested(d, "a.b.c", "val")
-        {'a': {'b': 
{'c': 'val'}}} + >>> d == {'a': {'b': {'c': 'val'}}} + True """ if "." in key: From ef5239ad7e96f586c5ce970cec834cf7881864ff Mon Sep 17 00:00:00 2001 From: Jesper Friis Date: Mon, 16 Dec 2024 00:56:02 +0100 Subject: [PATCH 04/13] Fixed doctest issue --- tripper/dataset/dataset.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/tripper/dataset/dataset.py b/tripper/dataset/dataset.py index 13fdb935..8db5b497 100644 --- a/tripper/dataset/dataset.py +++ b/tripper/dataset/dataset.py @@ -38,9 +38,6 @@ from pathlib import Path from typing import TYPE_CHECKING -import requests -import yaml # type: ignore - from tripper import DCAT, EMMO, OTEIO, OWL, RDF, Triplestore from tripper.utils import AttrDict, as_python @@ -171,6 +168,8 @@ def save_extra_content(ts: Triplestore, dct: dict) -> None: - data models (require that DLite is installed) """ + import requests + # Save statements and mappings statements = get_values(dct, "statements") statements.extend(get_values(dct, "mappings")) @@ -351,6 +350,8 @@ def get_jsonld_context(timeout: float = 5, fromfile: bool = True) -> dict: fromfile: Whether to load the context from local file. """ + import requests + if fromfile: with open(CONTEXT_PATH[7:], "r", encoding="utf-8") as f: context = json.load(f)["@context"] @@ -436,16 +437,18 @@ def add(d: dict, key: str, value: "Any") -> None: ) -def addnested(d: "Union[dict, list]", key: str, value: "Any"): +def addnested( + d: "Union[dict, list]", key: str, value: "Any" +) -> "Union[dict, list]": """Like add(), but allows `key` to be a dot-separated list of sub-keys. + Returns the updated `d`. Each sub-key will be added to `d` as a corresponding sub-dict. Example: >>> d = {} - >>> addnested(d, "a.b.c", "val") - >>> d == {'a': {'b': {'c': 'val'}}} + >>> addnested(d, "a.b.c", "val") == {'a': {'b': {'c': 'val'}}} True """ @@ -508,6 +511,8 @@ def expand_iri(iri: str, prefixes: dict) -> str: def read_datadoc(filename: "Union[str, Path]") -> dict: """Read YAML data documentation and return it as a dict.""" + import yaml # type: ignore + with open(filename, "r", encoding="utf-8") as f: d = yaml.safe_load(f) return prepare_datadoc(d) From 331878a1756ed225d48fc3a2e0acdd567ccc4774 Mon Sep 17 00:00:00 2001 From: Jesper Friis Date: Mon, 16 Dec 2024 01:01:22 +0100 Subject: [PATCH 05/13] Skip test_tabledoc if rdflib isn't available --- tests/dataset/test_tabledoc.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/dataset/test_tabledoc.py b/tests/dataset/test_tabledoc.py index 77e81dcc..49902b69 100644 --- a/tests/dataset/test_tabledoc.py +++ b/tests/dataset/test_tabledoc.py @@ -1,5 +1,7 @@ """Test the dataset module.""" +import pytest + from tripper import Triplestore from tripper.dataset import TableDoc @@ -10,6 +12,8 @@ def test_as_dicts(): from tripper import DCAT, EMMO, Namespace + pytest.importorskip("rdflib") + ONTO = Namespace("http:/example.com/onto#") DS = Namespace("http:/example.com/datasets#") From 5fe9cf7c387a9fb3ccd02856c0878ef8dceb8eba Mon Sep 17 00:00:00 2001 From: Jesper Friis Date: Mon, 16 Dec 2024 01:04:02 +0100 Subject: [PATCH 06/13] More pylint fixes... 
--- tests/dataset/test_dataaccess.py | 2 +- tests/dataset/test_dataset.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/dataset/test_dataaccess.py b/tests/dataset/test_dataaccess.py index 0cbc7727..ecf98dba 100644 --- a/tests/dataset/test_dataaccess.py +++ b/tests/dataset/test_dataaccess.py @@ -13,7 +13,7 @@ def test_save_and_load(): """Test save() and load().""" # pylint: disable=too-many-statements - from dataset_paths import outdir + from dataset_paths import outdir # pytest: disable=import-error from tripper import DCAT, DCTERMS, EMMO, Triplestore from tripper.dataset import load, load_dict, save, save_dict diff --git a/tests/dataset/test_dataset.py b/tests/dataset/test_dataset.py index 1a0cffbd..4fb5ec09 100644 --- a/tests/dataset/test_dataset.py +++ b/tests/dataset/test_dataset.py @@ -127,7 +127,7 @@ def test_datadoc(): """Test save_datadoc() and load_dict()/save_dict().""" # pylint: disable=too-many-statements - from dataset_paths import indir + from dataset_paths import indir # pytest: disable=import-error from tripper import CHAMEO, DCAT, EMMO, OTEIO, Triplestore from tripper.dataset import load_dict, save_datadoc, save_dict, search_iris @@ -220,7 +220,7 @@ def test_pipeline(): from tripper import Triplestore otelib = pytest.importorskip("otelib") - from dataset_paths import indir + from dataset_paths import indir # pytest: disable=import-error from tripper.dataset import get_partial_pipeline, save_datadoc From 4aaeed8551b56ece4c7c9af151ce3152a2779077 Mon Sep 17 00:00:00 2001 From: Jesper Friis Date: Mon, 16 Dec 2024 01:08:20 +0100 Subject: [PATCH 07/13] Placed importskip before importing EMMO --- tests/dataset/test_tabledoc.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/dataset/test_tabledoc.py b/tests/dataset/test_tabledoc.py index 49902b69..4a1c0613 100644 --- a/tests/dataset/test_tabledoc.py +++ b/tests/dataset/test_tabledoc.py @@ -2,18 +2,16 @@ import pytest -from tripper import Triplestore -from tripper.dataset import TableDoc - # if True: def test_as_dicts(): """Test the as_dicts() method.""" - from tripper import DCAT, EMMO, Namespace - pytest.importorskip("rdflib") + from tripper import DCAT, EMMO, Namespace, Triplestore + from tripper.dataset import TableDoc + ONTO = Namespace("http:/example.com/onto#") DS = Namespace("http:/example.com/datasets#") From 0f21fbbde6f6b6eaaf6583145f403fb7f841c0a7 Mon Sep 17 00:00:00 2001 From: Jesper Friis Date: Mon, 16 Dec 2024 01:12:12 +0100 Subject: [PATCH 08/13] typo --- tests/dataset/test_dataaccess.py | 2 +- tests/dataset/test_dataset.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/dataset/test_dataaccess.py b/tests/dataset/test_dataaccess.py index ecf98dba..af058440 100644 --- a/tests/dataset/test_dataaccess.py +++ b/tests/dataset/test_dataaccess.py @@ -13,7 +13,7 @@ def test_save_and_load(): """Test save() and load().""" # pylint: disable=too-many-statements - from dataset_paths import outdir # pytest: disable=import-error + from dataset_paths import outdir # pylint: disable=import-error from tripper import DCAT, DCTERMS, EMMO, Triplestore from tripper.dataset import load, load_dict, save, save_dict diff --git a/tests/dataset/test_dataset.py b/tests/dataset/test_dataset.py index 4fb5ec09..9bdec2c6 100644 --- a/tests/dataset/test_dataset.py +++ b/tests/dataset/test_dataset.py @@ -127,7 +127,7 @@ def test_datadoc(): """Test save_datadoc() and load_dict()/save_dict().""" # pylint: disable=too-many-statements - from dataset_paths import 
indir # pytest: disable=import-error + from dataset_paths import indir # pylint: disable=import-error from tripper import CHAMEO, DCAT, EMMO, OTEIO, Triplestore from tripper.dataset import load_dict, save_datadoc, save_dict, search_iris @@ -220,7 +220,7 @@ def test_pipeline(): from tripper import Triplestore otelib = pytest.importorskip("otelib") - from dataset_paths import indir # pytest: disable=import-error + from dataset_paths import indir # pylint: disable=import-error from tripper.dataset import get_partial_pipeline, save_datadoc From 4cc88cb0fb119697f35184177e9dbfb697141ac4 Mon Sep 17 00:00:00 2001 From: Jesper Friis Date: Mon, 16 Dec 2024 15:53:34 +0100 Subject: [PATCH 09/13] Fixed pylint errors --- tripper/dataset/dataset.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tripper/dataset/dataset.py b/tripper/dataset/dataset.py index b98eaeaa..bbb4a178 100644 --- a/tripper/dataset/dataset.py +++ b/tripper/dataset/dataset.py @@ -42,9 +42,6 @@ from pathlib import Path from typing import TYPE_CHECKING -import requests -import yaml # type: ignore - from tripper import DCAT, EMMO, OTEIO, OWL, RDF, Triplestore from tripper.utils import AttrDict, as_python From 92b213d7b2a292b04ddbaf0921d4e046a27e95db Mon Sep 17 00:00:00 2001 From: Jesper Friis Date: Thu, 19 Dec 2024 10:10:35 +0100 Subject: [PATCH 10/13] added csv file --- tests/input/semdata.csv | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 tests/input/semdata.csv diff --git a/tests/input/semdata.csv b/tests/input/semdata.csv new file mode 100644 index 00000000..631d9e69 --- /dev/null +++ b/tests/input/semdata.csv @@ -0,0 +1,5 @@ +@id;@type;title;description;creator;contactPoint;inSeries;datamodel;datamodelStorage;distribution.downloadURL;distribution.mediaType;distribution.parser;fromSample;isDescriptionOf +semdata:SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001;sem:SEMImage;SEM image of cement;Back-scattered SEM image of cement sample 77600 from Heidelberg, polished with 1 µm diamond compound.;Sigurd Wenner;Sigurd Wenner ;semdata:SEM_cement_batch2/77600-23-001;http://onto-ns.com/meta/matchmaker/0.2/SEMImage;https://github.com/HEU-MatCHMaker/DataDocumentation/blob/master/SEM/datamodels/SEMImage.yaml;https://github.com/EMMC-ASBL/tripper/raw/refs/heads/dataset/tests/input/77600-23-001_5kV_400x_m001.tif;image/tiff;parser:sem_hitachi;sample:SEM_cement_batch2/77600-23-001;mat:concrete1 +semdata:SEM_cement_batch2/77600-23-001;sem:SEMImageSeries;Series of SEM image of cement sample 77600;Back-scattered SEM image of cement sample 77600, polished with 1 µm diamond compound.;Sigurd Wenner;Sigurd Wenner ;semdata:SEM_cement_batch2; ;;sftp://nas.aimen.es/P_MATCHMAKER_SHARE_SINTEF/SEM_cement_batch2/77600-23-001;inode/directory;;; +semdata:SEM_cement_batch2;sem:SEMImageSeries;Nested series of SEM images of cement batch2;…;Sigurd Wenner;Sigurd Wenner ; ;;;sftp://nas.aimen.es/P_MATCHMAKER_SHARE_SINTEF/SEM_cement_batch2;inode/directory;;; +mple:SEM_cement_batch2/77600-23-001;chameo:Sample;Series for SEM images for sample 77600-23-001.; ;;;;;;;;;; From 66b9dd75d0291359f9507033b9a8159cf0a8d320 Mon Sep 17 00:00:00 2001 From: Jesper Friis Date: Mon, 30 Dec 2024 12:23:13 +0100 Subject: [PATCH 11/13] Apply suggestions from code review Co-authored-by: Tor S. 
Haugland
---
 pyproject.toml                 | 1 -
 tests/dataset/test_tabledoc.py | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 21196860..0398f0a0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -105,7 +105,6 @@ max-locals = 20
 disable = [
     "fixme",
     "invalid-name",
-    #"too-many-positional-arguments",
 ]
 good-names = [
     # Default
diff --git a/tests/dataset/test_tabledoc.py b/tests/dataset/test_tabledoc.py
index 4a1c0613..da74203c 100644
--- a/tests/dataset/test_tabledoc.py
+++ b/tests/dataset/test_tabledoc.py
@@ -1,4 +1,4 @@
-"""Test the dataset module."""
+"""Test the TableDoc class."""

 import pytest

From 575f09d2b13deb60ce8ae9addeb734650223ca31 Mon Sep 17 00:00:00 2001
From: Jesper Friis
Date: Mon, 30 Dec 2024 12:30:22 +0100
Subject: [PATCH 12/13] Apply suggestions from code review

Co-authored-by: Tor S. Haugland
---
 tripper/dataset/tabledoc.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tripper/dataset/tabledoc.py b/tripper/dataset/tabledoc.py
index b1a8ef51..9fd5d988 100644
--- a/tripper/dataset/tabledoc.py
+++ b/tripper/dataset/tabledoc.py
@@ -47,7 +47,7 @@ def asdicts(self) -> "List[dict]":

     def asdicts(self) -> "List[dict]":
         """Return the table as a list of dicts."""
-        kw = {"_context": self.context} if self.context else {}
+        kw = {"@context": self.context} if self.context else {}

         results = []
         for row in self.data:

From f45376db770b151d6ea7eb4de901fb646ef3dc43 Mon Sep 17 00:00:00 2001
From: Jesper Friis
Date: Mon, 30 Dec 2024 12:49:06 +0100
Subject: [PATCH 13/13] Added a clarifying comment as a response to a review
 comment by @torhaugl.

---
 tests/dataset/test_tabledoc.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/dataset/test_tabledoc.py b/tests/dataset/test_tabledoc.py
index da74203c..278e7881 100644
--- a/tests/dataset/test_tabledoc.py
+++ b/tests/dataset/test_tabledoc.py
@@ -32,6 +32,8 @@ def test_as_dicts():
             "onto": "http:/example.com/onto#",
             "ds": "http:/example.com/datasets#",
         },
+        # Replace the "ds" prefix above with this, once the "context" keyword
+        # argument is fully implemented.
         # context={
         #     "ds": "http:/example.com/datasets#",
        # },