Merge branch 'Health-RI-healthdcat_ap'

ckan · Jan 9, 2025 · f9cd102 · f9cd102
2 parents 868b81e + d334bb3
commit f9cd102
Show file tree

Hide file tree

Showing 30 changed files with 2,072 additions and 217 deletions.
diff --git a/ckanext/dcat/profiles/__init__.py b/ckanext/dcat/profiles/__init__.py
@@ -25,4 +25,5 @@
 from .euro_dcat_ap_3 import EuropeanDCATAP3Profile
 from .dcat_us_3 import DCATUS3Profile
 from .euro_dcat_ap_scheming import EuropeanDCATAPSchemingProfile
+from .euro_health_dcat_ap import EuropeanHealthDCATAPProfile
 from .schemaorg import SchemaOrgProfile
diff --git a/ckanext/dcat/profiles/base.py b/ckanext/dcat/profiles/base.py
@@ -2,16 +2,16 @@
 import json
 from urllib.parse import quote
 
+from ckan.lib.helpers import resource_formats
+from ckan.model.license import LicenseRegister
+from ckantoolkit import ObjectNotFound, asbool, aslist, config, get_action, url_for
 from dateutil.parser import parse as parse_date
-from rdflib import term, URIRef, BNode, Literal
-from rdflib.namespace import Namespace, RDF, XSD, SKOS, RDFS, ORG
-from geomet import wkt, InvalidGeoJSONException
+from geomet import InvalidGeoJSONException, wkt
+from rdflib import BNode, Literal, URIRef, term
+from rdflib.namespace import ORG, RDF, RDFS, SKOS, XSD, Namespace
 
-from ckantoolkit import config, url_for, asbool, aslist, get_action, ObjectNotFound
-from ckan.model.license import LicenseRegister
-from ckan.lib.helpers import resource_formats
 from ckanext.dcat.utils import DCAT_EXPOSE_SUBCATALOGS
-from ckanext.dcat.validators import is_year, is_year_month, is_date
+from ckanext.dcat.validators import is_date, is_year, is_year_month
 
 CNT = Namespace("http://www.w3.org/2011/content#")
 DCT = Namespace("http://purl.org/dc/terms/")

diff --git a/ckanext/dcat/profiles/euro_dcat_ap_scheming.py b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py
@@ -1,6 +1,6 @@
 import json
 
-from rdflib import URIRef, BNode, Literal
+from rdflib import URIRef, BNode, Literal, term
 from .base import RDFProfile, CleanedURIRef, URIRefOrLiteral
 from .base import (
     RDF,
@@ -10,6 +10,7 @@
     FOAF,
     SKOS,
     LOCN,
+    RDFS,
 )
 
 
@@ -118,6 +119,11 @@ def _parse_list_value(data_dict, field_name):
             if agents:
                 dataset_dict[key] = agents
 
+        # Add any qualifiedRelations
+        qual_relations = self._relationship_details(dataset_ref, DCAT.qualifiedRelation)
+        if qual_relations:
+            dataset_dict["qualified_relation"] = qual_relations
+
         # Repeating subfields: resources
         for schema_field in self._dataset_schema["resource_fields"]:
             if "repeating_subfields" in schema_field:
@@ -227,6 +233,10 @@ def _graph_from_dataset_v2_scheming(self, dataset_dict, dataset_ref):
                             spatial_ref, field[1], item[field[0]]
                         )
 
+        self._add_relationship(
+            dataset_ref, dataset_dict, "qualified_relation", DCAT.qualifiedRelation
+        )
+
         resources = dataset_dict.get("resources", [])
         for resource in resources:
             if resource.get("access_services"):
@@ -292,6 +302,80 @@ def _add_agents(
                     _type=URIRefOrLiteral,
                 )
 
+    def _relationship_details(self, subject, predicate):
+        """
+        Returns a list of dicts with details about a dcat:Relationship property, e.g.
+        dcat:qualifiedRelation
+
+        Both subject and predicate must be rdflib URIRef or BNode objects
+
+        Returns keys for uri, role, and relation with the values set to
+        an empty string if they could not be found.
+        """
+
+        relations = []
+        for relation in self.g.objects(subject, predicate):
+            relation_details = {}
+            relation_details["uri"] = (
+                str(relation) if isinstance(relation, term.URIRef) else ""
+            )
+            relation_details["role"] = self._object_value(relation, DCAT.hadRole)
+            relation_details["relation"] = self._object_value(relation, DCT.relation)
+            relations.append(relation_details)
+
+        return relations
+
+    def _add_relationship(
+        self,
+        dataset_ref,
+        dataset_dict,
+        relation_key,
+        rdf_predicate,
+    ):
+        """
+        Adds one or more Relationships to the RDF graph.
+
+        :param dataset_ref: The RDF reference of the dataset
+        :param dataset_dict: The dataset dictionary containing agent information
+        :param relation_key: field name in the CKAN dict (.e.g. "qualifiedRelation")
+        :param rdf_predicate: The RDF predicate (DCAT.qualifiedRelation)
+        """
+        relation = dataset_dict.get(relation_key)
+        if (
+            isinstance(relation, list)
+            and len(relation)
+            and self._not_empty_dict(relation[0])
+        ):
+            relations = relation
+
+            for relation in relations:
+
+                agent_uri = relation.get("uri")
+                if agent_uri:
+                    agent_ref = CleanedURIRef(agent_uri)
+                else:
+                    agent_ref = BNode()
+
+                self.g.add((agent_ref, RDF.type, DCAT.Relationship))
+                self.g.add((dataset_ref, rdf_predicate, agent_ref))
+
+                self._add_triple_from_dict(
+                    relation,
+                    agent_ref,
+                    DCT.relation,
+                    "relation",
+                    _type=URIRefOrLiteral,
+                    _class=RDFS.Resource,
+                )
+                self._add_triple_from_dict(
+                    relation,
+                    agent_ref,
+                    DCAT.hadRole,
+                    "role",
+                    _type=URIRefOrLiteral,
+                    _class=DCAT.Role,
+                )
+
     @staticmethod
     def _not_empty_dict(data_dict):
         return any(data_dict.values())
diff --git a/ckanext/dcat/profiles/euro_health_dcat_ap.py b/ckanext/dcat/profiles/euro_health_dcat_ap.py
@@ -0,0 +1,152 @@
+from rdflib import XSD, Literal, URIRef
+from rdflib.namespace import Namespace
+
+from ckanext.dcat.profiles.base import URIRefOrLiteral
+from ckanext.dcat.profiles.euro_dcat_ap_3 import EuropeanDCATAP3Profile
+
+# HealthDCAT-AP namespace. Note: not finalized yet, so this URI may still
+# change
+HEALTHDCATAP = Namespace("http://healthdataportal.eu/ns/health#")
+
+# Data Privacy Vocabulary namespace (used in this module for the legal basis,
+# personal data and purpose properties)
+DPV = Namespace("https://w3id.org/dpv#")
+
+# Prefix -> namespace mapping; each entry is bound to the graph when a dataset
+# is serialized by the profile defined in this module
+namespaces = {
+    "healthdcatap": HEALTHDCATAP,
+    "dpv": DPV,
+}
+
+
+class EuropeanHealthDCATAPProfile(EuropeanDCATAP3Profile):
+    """
+    A profile implementing HealthDCAT-AP, a health-related extension of the DCAT
+    application profile for sharing information about Catalogues containing Datasets
+    and Data Services descriptions in Europe.
+    """
+
+    def parse_dataset(self, dataset_dict, dataset_ref):
+        # Call super method for DCAT-AP 3 properties
+        dataset_dict = super(EuropeanHealthDCATAPProfile, self).parse_dataset(
+            dataset_dict, dataset_ref
+        )
+
+        dataset_dict = self._parse_health_fields(dataset_dict, dataset_ref)
+
+        return dataset_dict
+
+    def _parse_health_fields(self, dataset_dict, dataset_ref):
+        self.__parse_healthdcat_stringvalues(dataset_dict, dataset_ref)
+
+        self.__parse_healthdcat_intvalues(dataset_dict, dataset_ref)
+
+        # Add the HDAB. There should only ever be one but you never know
+        agents = self._agents_details(dataset_ref, HEALTHDCATAP.hdab)
+        if agents:
+            dataset_dict["hdab"] = agents
+
+        # Retention period
+        retention_start, retention_end = self._time_interval(
+            dataset_ref, HEALTHDCATAP.retentionPeriod, dcat_ap_version=2
+        )
+        retention_dict = {}
+        if retention_start is not None:
+            retention_dict["start"] = retention_start
+        if retention_end is not None:
+            retention_dict["end"] = retention_end
+        if retention_dict:
+            dataset_dict["retention_period"] = [retention_dict]
+
+        return dataset_dict
+
+    def __parse_healthdcat_intvalues(self, dataset_dict, dataset_ref):
+        for key, predicate in (
+            ("min_typical_age", HEALTHDCATAP.minTypicalAge),
+            ("max_typical_age", HEALTHDCATAP.maxTypicalAge),
+            ("number_of_records", HEALTHDCATAP.numberOfRecords),
+            ("number_of_unique_individuals", HEALTHDCATAP.numberOfUniqueIndividuals),
+        ):
+            value = self._object_value_int(dataset_ref, predicate)
+            # A zero value evaluates as False but is definitely not a None
+            if value is not None:
+                dataset_dict[key] = value
+
+    def __parse_healthdcat_stringvalues(self, dataset_dict, dataset_ref):
+        for (key, predicate,) in (
+            ("analytics", HEALTHDCATAP.analytics),
+            ("code_values", HEALTHDCATAP.hasCodeValues),
+            ("coding_system", HEALTHDCATAP.hasCodingSystem),
+            ("health_category", HEALTHDCATAP.healthCategory),
+            ("health_theme", HEALTHDCATAP.healthTheme),
+            ("legal_basis", DPV.hasLegalBasis),
+            ("personal_data", DPV.hasPersonalData),
+            ("population_coverage", HEALTHDCATAP.populationCoverage),
+            ("publisher_note", HEALTHDCATAP.publisherNote),
+            ("publisher_type", HEALTHDCATAP.publisherType),
+            ("purpose", DPV.hasPurpose),
+        ):
+            values = self._object_value_list(dataset_ref, predicate)
+            if values:
+                dataset_dict[key] = values
+
+    def graph_from_dataset(self, dataset_dict, dataset_ref):
+        super().graph_from_dataset(dataset_dict, dataset_ref)
+        for prefix, namespace in namespaces.items():
+            self.g.bind(prefix, namespace)
+
+        # key, predicate, fallbacks, _type, _class
+        items = [
+            ("analytics", HEALTHDCATAP.analytics, None, URIRefOrLiteral),
+            ("code_values", HEALTHDCATAP.hasCodeValues, None, URIRefOrLiteral),
+            ("coding_system", HEALTHDCATAP.hasCodingSystem, None, URIRefOrLiteral),
+            ("health_category", HEALTHDCATAP.healthCategory, None, URIRefOrLiteral),
+            ("health_theme", HEALTHDCATAP.healthCategory, None, URIRefOrLiteral),
+            ("legal_basis", DPV.hasLegalBasis, None, URIRefOrLiteral),
+            (
+                "population_coverage",
+                HEALTHDCATAP.populationCoverage,
+                None,
+                URIRefOrLiteral,
+            ),
+            ("personal_data", DPV.hasPersonalData, None, URIRef),
+            ("publisher_note", HEALTHDCATAP.publisherNote, None, URIRefOrLiteral),
+            ("publisher_type", HEALTHDCATAP.publisherType, None, URIRefOrLiteral),
+            ("purpose", DPV.hasPurpose, None, URIRefOrLiteral),
+        ]
+        self._add_list_triples_from_dict(dataset_dict, dataset_ref, items)
+
+        items = [
+            ("min_typical_age", HEALTHDCATAP.minTypicalAge),
+            ("max_typical_age", HEALTHDCATAP.maxTypicalAge),
+            ("number_of_records", HEALTHDCATAP.numberOfRecords),
+            ("number_of_unique_individuals", HEALTHDCATAP.numberOfUniqueIndividuals),
+        ]
+        for key, predicate in items:
+            self._add_nonneg_integer_triple(dataset_dict, dataset_ref, key, predicate)
+
+        self._add_agents(dataset_ref, dataset_dict, "hdab", HEALTHDCATAP.hdab)
+
+    def _add_nonneg_integer_triple(self, dataset_dict, dataset_ref, key, predicate):
+        """
+        Adds non-negative integers to the Dataset graph (xsd:nonNegativeInteger)
+
+        dataset_ref: subject of Graph
+        key: scheming key in CKAN
+        predicate: predicate to use
+        """
+        value = self._get_dict_value(dataset_dict, key)
+
+        if value:
+            try:
+                if int(value) < 0:
+                    raise ValueError("Not a non-negative integer")
+                self.g.add(
+                    (
+                        dataset_ref,
+                        predicate,
+                        Literal(int(value), datatype=XSD.nonNegativeInteger),
+                    )
+                )
+            except (ValueError, TypeError):
+                self.g.add((dataset_ref, predicate, Literal(value)))
+
+    def graph_from_catalog(self, catalog_dict, catalog_ref):
+        super().graph_from_catalog(catalog_dict, catalog_ref)
diff --git a/ckanext/dcat/schemas/dcat_ap_full.yaml b/ckanext/dcat/schemas/dcat_ap_full.yaml
@@ -268,6 +268,23 @@ dataset_fields:
   help_inline: true
   help_text: This property refers to a related Dataset that is a version, edition, or adaptation of the described Dataset.
 
+- field_name: qualified_relation
+  label: Qualified relation
+  repeating_label: Relationship
+  repeating_subfields:
+
+    - field_name: uri
+      label: URI
+
+    - field_name: relation
+      label: Relation
+      help_text: The resource related to the source resource.
+
+    - field_name: role
+      label: Role
+      help_text: The function of an entity or agent with respect to another entity or resource.
+  help_text: A description of a relationship with another resource.
+
 #- field_name: hvd_category
 #  label: HVD Category
 #  preset: multiple_text

diff --git a/ckanext/dcat/schemas/dcat_us_full.yaml b/ckanext/dcat/schemas/dcat_us_full.yaml
@@ -331,6 +331,23 @@ dataset_fields:
     - field_name: license
       label: License
 
+- field_name: qualified_relation
+  label: Qualified relation
+  repeating_label: Relationship
+  repeating_subfields:
+
+    - field_name: uri
+      label: URI
+
+    - field_name: relation
+      label: Relation
+      help_text: The resource related to the source resource.
+
+    - field_name: role
+      label: Role
+      help_text: The function of an entity or agent with respect to another entity or resource.
+  help_text: A description of a relationship with another resource.
+
 # Note: if not provided, this will be autogenerated
 - field_name: uri
   label: URI