New TableDoc class providing a table interface for data documentation (#273)

# Description
Added the TableDoc class providing a table interface for data documentation.
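For readers skimming the diff, here is a minimal usage sketch distilled from the new `tests/dataset/test_tabledoc.py` below (the `onto`/`ds` namespaces are test placeholders, not real vocabularies):

```python
from tripper import Triplestore
from tripper.dataset import TableDoc

# Each row documents one resource; dotted column names like
# "distribution.downloadURL" become nested keys in the resulting dict.
td = TableDoc(
    header=["@id", "@type", "distribution.downloadURL"],
    data=[
        ("ds:d1", "onto:T1", "file:///data/d1.txt"),
        ("ds:d2", "onto:T2", "file:///data/d2.txt"),
    ],
    prefixes={
        "onto": "http://example.com/onto#",
        "ds": "http://example.com/datasets#",
    },
)

ts = Triplestore(backend="rdflib")
td.save(ts)  # document all rows in the triplestore
print(ts.serialize())
```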

---------

Co-authored-by: Tor S. Haugland <torshaugland@gmail.com>
jesper-friis and torhaugl authored Jan 3, 2025
1 parent f040624 commit c84bae1
Showing 11 changed files with 296 additions and 44 deletions.
3 changes: 3 additions & 0 deletions docs/api_reference/dataset/tabledoc.md
@@ -0,0 +1,3 @@
# tabledoc

::: tripper.dataset.tabledoc
6 changes: 3 additions & 3 deletions pyproject.toml
@@ -104,7 +104,7 @@ max-public-methods = 25
max-locals = 20
disable = [
"fixme",
"too-many-positional-arguments",
"invalid-name",
]
good-names = [
# Default
@@ -115,8 +115,8 @@ good-names = [
"s", "p", "o",
# Namespaces
"EX",
# dict, value, file, ...
"d", "v", "f",
# dict, value, file, keyword...
"d", "v", "f", "kw",
]

[tool.pytest.ini_options]
12 changes: 12 additions & 0 deletions tests/dataset/dataset_paths.py
@@ -0,0 +1,12 @@
"""Defines paths for tests.
It defines some directories and some utility functions that can be used
with or without conftest.
"""

from pathlib import Path

testdir = Path(__file__).resolve().parent.parent
ontodir = testdir / "ontologies"
indir = testdir / "input"
outdir = testdir / "output"
17 changes: 6 additions & 11 deletions tests/dataset/test_dataaccess.py
@@ -2,24 +2,19 @@

# pylint: disable=invalid-name,too-many-locals,duplicate-code

from pathlib import Path

import pytest

pytest.importorskip("yaml")
pytest.importorskip("requests")

thisdir = Path(__file__).resolve().parent
testdir = thisdir.parent
inputdir = testdir / "input"
outputdir = testdir / "output"


# if True:
def test_save_and_load():
"""Test save() and load()."""
# pylint: disable=too-many-statements

from dataset_paths import outdir # pylint: disable=import-error

from tripper import DCAT, DCTERMS, EMMO, Triplestore
from tripper.dataset import load, load_dict, save, save_dict

@@ -38,7 +33,6 @@ def test_save_and_load():
# Test save dict
save_dict(
ts,
type="dataset",
dct={
"@id": SEMDATA.img1,
"distribution": {
@@ -49,6 +43,7 @@
"format": "tiff",
},
},
type="dataset",
)
newdistr = load_dict(ts, SEMDATA.img1)
assert newdistr["@type"] == [DCAT.Dataset, EMMO.DataSet]
@@ -57,20 +52,20 @@

save_dict(
ts,
type="generator",
dct={
"@id": GEN.sem_hitachi,
"generatorType": "application/vnd.dlite-generate",
"configuration": {"driver": "hitachi"},
},
type="generator",
)

# Test load dataset (this downloads an actual image from github)
data = load(ts, iri)
assert len(data) == 53502

# Test save dataset with anonymous distribution
newfile = outputdir / "newimage.tiff"
newfile = outdir / "newimage.tiff"
newfile.unlink(missing_ok=True)
buf = b"some bytes..."
save(
@@ -94,7 +89,7 @@ def test_save_and_load():
assert newimage.distribution.downloadURL == f"file:{newfile}"

# Test save dataset with named distribution
newfile2 = outputdir / "newimage.png"
newfile2 = outdir / "newimage.png"
newfile2.unlink(missing_ok=True)
save(
ts,
50 changes: 33 additions & 17 deletions tests/dataset/test_dataset.py
@@ -2,18 +2,11 @@

# pylint: disable=invalid-name,too-many-locals,duplicate-code

from pathlib import Path

import pytest

pytest.importorskip("yaml")
pytest.importorskip("requests")

thisdir = Path(__file__).resolve().parent
testdir = thisdir.parent
inputdir = testdir / "input"
outputdir = testdir / "output"


def test_get_jsonld_context():
"""Test get_jsonld_context()."""
@@ -73,12 +66,31 @@ def test_add():
from tripper.dataset.dataset import add

d = {}
add(d, "a", 1)
add(d, "b", 1)
add(d, "b", 1)
add(d, "a", 2)
add(d, "a", 1)
assert d == {"a": [1, 2], "b": 1}
add(d, "a", "1")
add(d, "b", "1")
add(d, "b", "1")
add(d, "a", "2")
add(d, "a", "1")
add(d, "a", {"c": "3"})
assert d == {"a": ["1", "2", {"c": "3"}], "b": "1"}


def test_addnested():
"""Test help-function addnested()."""
from tripper.dataset.dataset import addnested
from tripper.utils import AttrDict

d = AttrDict()
addnested(d, "a.b", "1")
assert d == {"a": {"b": "1"}}

addnested(d, "a", "2")
assert d == {"a": ["2", {"b": "1"}]}

addnested(d, "a.b.c", {"d": "3"})
assert d.a[0] == "2"
assert d.a[1].b[1].c == {"d": "3"}
assert d == {"a": ["2", {"b": ["1", {"c": {"d": "3"}}]}]}


def test_get():
@@ -115,6 +127,8 @@ def test_datadoc():
"""Test save_datadoc() and load_dict()/save_dict()."""
# pylint: disable=too-many-statements

from dataset_paths import indir # pylint: disable=import-error

from tripper import CHAMEO, DCAT, EMMO, OTEIO, Triplestore
from tripper.dataset import load_dict, save_datadoc, save_dict, search_iris

@@ -124,7 +138,7 @@
ts = Triplestore("rdflib")

# Load data documentation into triplestore
datadoc = save_datadoc(ts, inputdir / "semdata.yaml")
datadoc = save_datadoc(ts, indir / "semdata.yaml")
assert isinstance(datadoc, dict)
assert "@context" in datadoc

@@ -167,8 +181,8 @@ def test_datadoc():
# Test save dict
save_dict(
ts,
"distribution",
{"@id": SEMDATA.newdistr, "format": "txt"},
dct={"@id": SEMDATA.newdistr, "format": "txt"},
type="distribution",
prefixes={"echem": "https://w3id.org/emmo/domain/electrochemistry"},
)
newdistr = load_dict(ts, SEMDATA.newdistr)
@@ -206,11 +220,13 @@ def test_pipeline():
from tripper import Triplestore

otelib = pytest.importorskip("otelib")
from dataset_paths import indir # pylint: disable=import-error

from tripper.dataset import get_partial_pipeline, save_datadoc

# Prepare triplestore
ts = Triplestore("rdflib")
save_datadoc(ts, inputdir / "semdata.yaml")
save_datadoc(ts, indir / "semdata.yaml")

SEMDATA = ts.namespaces["semdata"]

83 changes: 83 additions & 0 deletions tests/dataset/test_tabledoc.py
@@ -0,0 +1,83 @@
"""Test the TableDoc class."""

import pytest


# if True:
def test_as_dicts():
"""Test the as_dicts() method."""

pytest.importorskip("rdflib")

from tripper import DCAT, EMMO, Namespace, Triplestore
from tripper.dataset import TableDoc

ONTO = Namespace("http:/example.com/onto#")
DS = Namespace("http:/example.com/datasets#")

td = TableDoc(
header=[
"@id",
"@type",
"@type",
"inSeries",
"distribution.downloadURL",
],
data=[
("ds:s1", "onto:T1", "onto:T2", None, "file:///data/"),
("ds:d1", "onto:T1", None, "ds:s1", "file:///data/d1.txt"),
("ds:d2", "onto:T2", None, "ds:s1", "file:///data/d2.txt"),
],
prefixes={
"onto": "http:/example.com/onto#",
"ds": "http:/example.com/datasets#",
},
# Replace the "ds" prefix above with this, once the "context" keyword
# argument is fully implemented.
# context={
# "ds": "http:/example.com/datasets#",
# },
)

s1, d1, d2 = td.asdicts() # pylint: disable=unbalanced-tuple-unpacking

assert s1["@id"] == DS.s1
assert set(s1["@type"]) == {
DCAT.Dataset,
EMMO.DataSet,
ONTO.T1,
ONTO.T2,
}
assert "inSeries" not in s1
assert s1.distribution == {
"@type": DCAT.Distribution,
"downloadURL": "file:///data/",
}

assert d1["@id"] == DS.d1
assert set(d1["@type"]) == {
DCAT.Dataset,
EMMO.DataSet,
ONTO.T1,
}
assert d1.inSeries == DS.s1
assert d1.distribution == {
"@type": DCAT.Distribution,
"downloadURL": "file:///data/d1.txt",
}

assert d2["@id"] == DS.d2
assert set(d2["@type"]) == {
DCAT.Dataset,
EMMO.DataSet,
ONTO.T2,
}
assert d2.inSeries == DS.s1
assert d2.distribution == {
"@type": DCAT.Distribution,
"downloadURL": "file:///data/d2.txt",
}

ts = Triplestore(backend="rdflib")
td.save(ts)
print(ts.serialize())
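Taken together, the assertions suggest that asdicts() boils down to expanding prefixed cell values and folding each row into a dict with addnested(). A rough sketch under those assumptions, reusing AttrDict and the addnested() sketch from the earlier example (`expand()` is a hypothetical helper, not tripper API):

```python
def asdicts_sketch(header, data, prefixes):
    """Roughly what asdicts() must do to satisfy the assertions above."""

    def expand(value):
        # Expand CURIEs such as "ds:d1" against the prefixes table.
        prefix, _, name = str(value).partition(":")
        return prefixes[prefix] + name if prefix in prefixes else value

    dicts = []
    for row in data:
        d = AttrDict()
        for key, cell in zip(header, row):
            if cell is not None:  # empty cells are skipped entirely
                addnested(d, key, expand(cell))
        # The default @type values (dcat:Dataset, emmo:DataSet) and the
        # distribution @type are added by the dataset machinery on top.
        dicts.append(d)
    return dicts
```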
5 changes: 5 additions & 0 deletions tests/input/semdata.csv
@@ -0,0 +1,5 @@
@id;@type;title;description;creator;contactPoint;inSeries;datamodel;datamodelStorage;distribution.downloadURL;distribution.mediaType;distribution.parser;fromSample;isDescriptionOf
semdata:SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001;sem:SEMImage;SEM image of cement;Back-scattered SEM image of cement sample 77600 from Heidelberg, polished with 1 µm diamond compound.;Sigurd Wenner;Sigurd Wenner <Sigurd.Wenner@sintef.no>;semdata:SEM_cement_batch2/77600-23-001;http://onto-ns.com/meta/matchmaker/0.2/SEMImage;https://github.com/HEU-MatCHMaker/DataDocumentation/blob/master/SEM/datamodels/SEMImage.yaml;https://github.com/EMMC-ASBL/tripper/raw/refs/heads/dataset/tests/input/77600-23-001_5kV_400x_m001.tif;image/tiff;parser:sem_hitachi;sample:SEM_cement_batch2/77600-23-001;mat:concrete1
semdata:SEM_cement_batch2/77600-23-001;sem:SEMImageSeries;Series of SEM image of cement sample 77600;Back-scattered SEM image of cement sample 77600, polished with 1 µm diamond compound.;Sigurd Wenner;Sigurd Wenner <Sigurd.Wenner@sintef.no>;semdata:SEM_cement_batch2; ;;sftp://nas.aimen.es/P_MATCHMAKER_SHARE_SINTEF/SEM_cement_batch2/77600-23-001;inode/directory;;;
semdata:SEM_cement_batch2;sem:SEMImageSeries;Nested series of SEM images of cement batch2;;Sigurd Wenner;Sigurd Wenner <Sigurd.Wenner@sintef.no>; ;;;sftp://nas.aimen.es/P_MATCHMAKER_SHARE_SINTEF/SEM_cement_batch2;inode/directory;;;
sample:SEM_cement_batch2/77600-23-001;chameo:Sample;Series for SEM images for sample 77600-23-001.; ;;;;;;;;;;
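The new fixture uses semicolons as separators and the same dotted column conventions as the TableDoc header in the test above. A hedged sketch of feeding it to TableDoc (the prefix URIs are placeholders; the real ones are defined in tests/input/semdata.yaml):

```python
import csv

from tripper.dataset import TableDoc

# Read the semicolon-separated fixture into a header row and data rows.
with open("tests/input/semdata.csv", newline="", encoding="utf-8") as f:
    header, *rows = list(csv.reader(f, delimiter=";"))

td = TableDoc(
    header=header,
    data=rows,
    prefixes={
        # Placeholder URIs -- the actual prefixes live in semdata.yaml.
        "semdata": "http://example.com/semdata#",
        "sem": "http://example.com/sem#",
        "parser": "http://example.com/parser#",
        "sample": "http://example.com/sample#",
        "mat": "http://example.com/mat#",
        "chameo": "http://example.com/chameo#",
    },
)
```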
1 change: 1 addition & 0 deletions tripper/dataset/__init__.py
@@ -12,3 +12,4 @@
save_dict,
search_iris,
)
from .tabledoc import TableDoc
4 changes: 2 additions & 2 deletions tripper/dataset/dataaccess.py
@@ -175,9 +175,9 @@ def save(
# Update triplestore
ts.add_triples(triples)
if save_dataset:
save_dict(ts, "dataset", dataset, prefixes=prefixes)
save_dict(ts, dataset, "dataset", prefixes=prefixes)
elif save_distribution:
save_dict(ts, "distribution", distribution, prefixes=prefixes)
save_dict(ts, distribution, "distribution", prefixes=prefixes)

return dataset["@id"]

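Both call sites above track the same signature change seen in the tests: the documentation dict is now passed before the resource type, which may also be given by keyword. A usage sketch of the new calling convention (the IRI and prefix are made up for illustration):

```python
from tripper import Triplestore
from tripper.dataset import save_dict

ts = Triplestore("rdflib")
save_dict(
    ts,
    dct={"@id": "ex:mydata", "format": "txt"},  # the dict now comes first
    type="dataset",                             # ...followed by the type
    prefixes={"ex": "http://example.com/ex#"},
)
```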