New TableDoc class providing a table interface for data documentation (#273)

# Description
Added the TableDoc class providing a table interface for data documentation.
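For readers skimming the diff, here is a minimal usage sketch distilled from the new `tests/dataset/test_tabledoc.py` below (the `onto`/`ds` namespaces are test placeholders, not real vocabularies):

```python
from tripper import Triplestore
from tripper.dataset import TableDoc

# Each row documents one resource; dotted column names like
# "distribution.downloadURL" become nested keys in the resulting dict.
td = TableDoc(
    header=["@id", "@type", "distribution.downloadURL"],
    data=[
        ("ds:d1", "onto:T1", "file:///data/d1.txt"),
        ("ds:d2", "onto:T2", "file:///data/d2.txt"),
    ],
    prefixes={
        "onto": "http://example.com/onto#",
        "ds": "http://example.com/datasets#",
    },
)

ts = Triplestore(backend="rdflib")
td.save(ts)  # document all rows in the triplestore
print(ts.serialize())
```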

---------

Co-authored-by: Tor S. Haugland <torshaugland@gmail.com>
jesper-friis and torhaugl authored Jan 3, 2025
1 parent f040624 commit c84bae1
Showing 11 changed files with 296 additions and 44 deletions.
3 changes: 3 additions & 0 deletions docs/api_reference/dataset/tabledoc.md
@@ -0,0 +1,3 @@
# tabledoc

::: tripper.dataset.tabledoc
6 changes: 3 additions & 3 deletions pyproject.toml
@@ -104,7 +104,7 @@ max-public-methods = 25
max-locals = 20
disable = [
"fixme",
"too-many-positional-arguments",
"invalid-name",
]
good-names = [
# Default
@@ -115,8 +115,8 @@ good-names = [
"s", "p", "o",
# Namespaces
"EX",
# dict, value, file, ...
"d", "v", "f",
# dict, value, file, keyword...
"d", "v", "f", "kw",
]

[tool.pytest.ini_options]
12 changes: 12 additions & 0 deletions tests/dataset/dataset_paths.py
@@ -0,0 +1,12 @@
"""Defines paths for tests.
It defines some directories and some utility functions that can be used
with or without conftest.
"""

from pathlib import Path

testdir = Path(__file__).resolve().parent.parent
ontodir = testdir / "ontologies"
indir = testdir / "input"
outdir = testdir / "output"
17 changes: 6 additions & 11 deletions tests/dataset/test_dataaccess.py
@@ -2,24 +2,19 @@

# pylint: disable=invalid-name,too-many-locals,duplicate-code

from pathlib import Path

import pytest

pytest.importorskip("yaml")
pytest.importorskip("requests")

thisdir = Path(__file__).resolve().parent
testdir = thisdir.parent
inputdir = testdir / "input"
outputdir = testdir / "output"


# if True:
def test_save_and_load():
"""Test save() and load()."""
# pylint: disable=too-many-statements

from dataset_paths import outdir # pylint: disable=import-error

from tripper import DCAT, DCTERMS, EMMO, Triplestore
from tripper.dataset import load, load_dict, save, save_dict

@@ -38,7 +33,6 @@ def test_save_and_load():
# Test save dict
save_dict(
ts,
type="dataset",
dct={
"@id": SEMDATA.img1,
"distribution": {
@@ -49,6 +43,7 @@
"format": "tiff",
},
},
type="dataset",
)
newdistr = load_dict(ts, SEMDATA.img1)
assert newdistr["@type"] == [DCAT.Dataset, EMMO.DataSet]
@@ -57,20 +52,20 @@

save_dict(
ts,
type="generator",
dct={
"@id": GEN.sem_hitachi,
"generatorType": "application/vnd.dlite-generate",
"configuration": {"driver": "hitachi"},
},
type="generator",
)

# Test load dataset (this downloads an actual image from github)
data = load(ts, iri)
assert len(data) == 53502

# Test save dataset with anonymous distribution
newfile = outputdir / "newimage.tiff"
newfile = outdir / "newimage.tiff"
newfile.unlink(missing_ok=True)
buf = b"some bytes..."
save(
@@ -94,7 +89,7 @@ def test_save_and_load():
assert newimage.distribution.downloadURL == f"file:{newfile}"

# Test save dataset with named distribution
newfile2 = outputdir / "newimage.png"
newfile2 = outdir / "newimage.png"
newfile2.unlink(missing_ok=True)
save(
ts,
50 changes: 33 additions & 17 deletions tests/dataset/test_dataset.py
@@ -2,18 +2,11 @@

# pylint: disable=invalid-name,too-many-locals,duplicate-code

from pathlib import Path

import pytest

pytest.importorskip("yaml")
pytest.importorskip("requests")

thisdir = Path(__file__).resolve().parent
testdir = thisdir.parent
inputdir = testdir / "input"
outputdir = testdir / "output"


def test_get_jsonld_context():
"""Test get_jsonld_context()."""
@@ -73,12 +66,31 @@ def test_add():
from tripper.dataset.dataset import add

d = {}
add(d, "a", 1)
add(d, "b", 1)
add(d, "b", 1)
add(d, "a", 2)
add(d, "a", 1)
assert d == {"a": [1, 2], "b": 1}
add(d, "a", "1")
add(d, "b", "1")
add(d, "b", "1")
add(d, "a", "2")
add(d, "a", "1")
add(d, "a", {"c": "3"})
assert d == {"a": ["1", "2", {"c": "3"}], "b": "1"}


def test_addnested():
"""Test help-function addnested()."""
from tripper.dataset.dataset import addnested
from tripper.utils import AttrDict

d = AttrDict()
addnested(d, "a.b", "1")
assert d == {"a": {"b": "1"}}

addnested(d, "a", "2")
assert d == {"a": ["2", {"b": "1"}]}

addnested(d, "a.b.c", {"d": "3"})
assert d.a[0] == "2"
assert d.a[1].b[1].c == {"d": "3"}
assert d == {"a": ["2", {"b": ["1", {"c": {"d": "3"}}]}]}


def test_get():
@@ -115,6 +127,8 @@ def test_datadoc():
"""Test save_datadoc() and load_dict()/save_dict()."""
# pylint: disable=too-many-statements

from dataset_paths import indir # pylint: disable=import-error

from tripper import CHAMEO, DCAT, EMMO, OTEIO, Triplestore
from tripper.dataset import load_dict, save_datadoc, save_dict, search_iris

@@ -124,7 +138,7 @@
ts = Triplestore("rdflib")

# Load data documentation into triplestore
datadoc = save_datadoc(ts, inputdir / "semdata.yaml")
datadoc = save_datadoc(ts, indir / "semdata.yaml")
assert isinstance(datadoc, dict)
assert "@context" in datadoc

@@ -167,8 +181,8 @@ def test_datadoc():
# Test save dict
save_dict(
ts,
"distribution",
{"@id": SEMDATA.newdistr, "format": "txt"},
dct={"@id": SEMDATA.newdistr, "format": "txt"},
type="distribution",
prefixes={"echem": "https://w3id.org/emmo/domain/electrochemistry"},
)
newdistr = load_dict(ts, SEMDATA.newdistr)
@@ -206,11 +220,13 @@ def test_pipeline():
from tripper import Triplestore

otelib = pytest.importorskip("otelib")
from dataset_paths import indir # pylint: disable=import-error

from tripper.dataset import get_partial_pipeline, save_datadoc

# Prepare triplestore
ts = Triplestore("rdflib")
save_datadoc(ts, inputdir / "semdata.yaml")
save_datadoc(ts, indir / "semdata.yaml")

SEMDATA = ts.namespaces["semdata"]

83 changes: 83 additions & 0 deletions tests/dataset/test_tabledoc.py
@@ -0,0 +1,83 @@
"""Test the TableDoc class."""

import pytest


# if True:
def test_as_dicts():
"""Test the as_dicts() method."""

pytest.importorskip("rdflib")

from tripper import DCAT, EMMO, Namespace, Triplestore
from tripper.dataset import TableDoc

ONTO = Namespace("http:/example.com/onto#")
DS = Namespace("http:/example.com/datasets#")

td = TableDoc(
header=[
"@id",
"@type",
"@type",
"inSeries",
"distribution.downloadURL",
],
data=[
("ds:s1", "onto:T1", "onto:T2", None, "file:///data/"),
("ds:d1", "onto:T1", None, "ds:s1", "file:///data/d1.txt"),
("ds:d2", "onto:T2", None, "ds:s1", "file:///data/d2.txt"),
],
prefixes={
"onto": "http:/example.com/onto#",
"ds": "http:/example.com/datasets#",
},
# Replace the "ds" prefix above with this, once the "context" keyword
# argument is fully implemented.
# context={
# "ds": "http:/example.com/datasets#",
# },
)

s1, d1, d2 = td.asdicts() # pylint: disable=unbalanced-tuple-unpacking

assert s1["@id"] == DS.s1
assert set(s1["@type"]) == {
DCAT.Dataset,
EMMO.DataSet,
ONTO.T1,
ONTO.T2,
}
assert "inSeries" not in s1
assert s1.distribution == {
"@type": DCAT.Distribution,
"downloadURL": "file:///data/",
}

assert d1["@id"] == DS.d1
assert set(d1["@type"]) == {
DCAT.Dataset,
EMMO.DataSet,
ONTO.T1,
}
assert d1.inSeries == DS.s1
assert d1.distribution == {
"@type": DCAT.Distribution,
"downloadURL": "file:///data/d1.txt",
}

assert d2["@id"] == DS.d2
assert set(d2["@type"]) == {
DCAT.Dataset,
EMMO.DataSet,
ONTO.T2,
}
assert d2.inSeries == DS.s1
assert d2.distribution == {
"@type": DCAT.Distribution,
"downloadURL": "file:///data/d2.txt",
}

ts = Triplestore(backend="rdflib")
td.save(ts)
print(ts.serialize())
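Taken together, the assertions suggest that asdicts() boils down to expanding prefixed cell values and folding each row into a dict with addnested(). A rough sketch under those assumptions, reusing AttrDict and the addnested() sketch from the earlier example (`expand()` is a hypothetical helper, not tripper API):

```python
def asdicts_sketch(header, data, prefixes):
    """Roughly what asdicts() must do to satisfy the assertions above."""

    def expand(value):
        # Expand CURIEs such as "ds:d1" against the prefixes table.
        prefix, _, name = str(value).partition(":")
        return prefixes[prefix] + name if prefix in prefixes else value

    dicts = []
    for row in data:
        d = AttrDict()
        for key, cell in zip(header, row):
            if cell is not None:  # empty cells are skipped entirely
                addnested(d, key, expand(cell))
        # The default @type values (dcat:Dataset, emmo:DataSet) and the
        # distribution @type are added by the dataset machinery on top.
        dicts.append(d)
    return dicts
```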
5 changes: 5 additions & 0 deletions tests/input/semdata.csv
@@ -0,0 +1,5 @@
@id;@type;title;description;creator;contactPoint;inSeries;datamodel;datamodelStorage;distribution.downloadURL;distribution.mediaType;distribution.parser;fromSample;isDescriptionOf
semdata:SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001;sem:SEMImage;SEM image of cement;Back-scattered SEM image of cement sample 77600 from Heidelberg, polished with 1 µm diamond compound.;Sigurd Wenner;Sigurd Wenner <Sigurd.Wenner@sintef.no>;semdata:SEM_cement_batch2/77600-23-001;http://onto-ns.com/meta/matchmaker/0.2/SEMImage;https://github.com/HEU-MatCHMaker/DataDocumentation/blob/master/SEM/datamodels/SEMImage.yaml;https://github.com/EMMC-ASBL/tripper/raw/refs/heads/dataset/tests/input/77600-23-001_5kV_400x_m001.tif;image/tiff;parser:sem_hitachi;sample:SEM_cement_batch2/77600-23-001;mat:concrete1
semdata:SEM_cement_batch2/77600-23-001;sem:SEMImageSeries;Series of SEM image of cement sample 77600;Back-scattered SEM image of cement sample 77600, polished with 1 µm diamond compound.;Sigurd Wenner;Sigurd Wenner <Sigurd.Wenner@sintef.no>;semdata:SEM_cement_batch2; ;;sftp://nas.aimen.es/P_MATCHMAKER_SHARE_SINTEF/SEM_cement_batch2/77600-23-001;inode/directory;;;
semdata:SEM_cement_batch2;sem:SEMImageSeries;Nested series of SEM images of cement batch2;;Sigurd Wenner;Sigurd Wenner <Sigurd.Wenner@sintef.no>; ;;;sftp://nas.aimen.es/P_MATCHMAKER_SHARE_SINTEF/SEM_cement_batch2;inode/directory;;;
sample:SEM_cement_batch2/77600-23-001;chameo:Sample;Series for SEM images for sample 77600-23-001.; ;;;;;;;;;;
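The new fixture uses semicolons as separators and the same dotted column conventions as the TableDoc header in the test above. A hedged sketch of feeding it to TableDoc (the prefix URIs are placeholders; the real ones are defined in tests/input/semdata.yaml):

```python
import csv

from tripper.dataset import TableDoc

# Read the semicolon-separated fixture into a header row and data rows.
with open("tests/input/semdata.csv", newline="", encoding="utf-8") as f:
    header, *rows = list(csv.reader(f, delimiter=";"))

td = TableDoc(
    header=header,
    data=rows,
    prefixes={
        # Placeholder URIs -- the actual prefixes live in semdata.yaml.
        "semdata": "http://example.com/semdata#",
        "sem": "http://example.com/sem#",
        "parser": "http://example.com/parser#",
        "sample": "http://example.com/sample#",
        "mat": "http://example.com/mat#",
        "chameo": "http://example.com/chameo#",
    },
)
```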
1 change: 1 addition & 0 deletions tripper/dataset/__init__.py
@@ -12,3 +12,4 @@
save_dict,
search_iris,
)
from .tabledoc import TableDoc
4 changes: 2 additions & 2 deletions tripper/dataset/dataaccess.py
@@ -175,9 +175,9 @@ def save(
# Update triplestore
ts.add_triples(triples)
if save_dataset:
save_dict(ts, "dataset", dataset, prefixes=prefixes)
save_dict(ts, dataset, "dataset", prefixes=prefixes)
elif save_distribution:
save_dict(ts, "distribution", distribution, prefixes=prefixes)
save_dict(ts, distribution, "distribution", prefixes=prefixes)

return dataset["@id"]

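Both call sites above track the same signature change seen in the tests: the documentation dict is now passed before the resource type, which may also be given by keyword. A usage sketch of the new calling convention (the IRI and prefix are made up for illustration):

```python
from tripper import Triplestore
from tripper.dataset import save_dict

ts = Triplestore("rdflib")
save_dict(
    ts,
    dct={"@id": "ex:mydata", "format": "txt"},  # the dict now comes first
    type="dataset",                             # ...followed by the type
    prefixes={"ex": "http://example.com/ex#"},
)
```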