Skip to content

Commit

Permalink
Harvest dct:conformsTo for Dataset and Resource (#2949)
Browse files Browse the repository at this point in the history
* Update schemas to embed document and improve validation

* fixing tests

* Fixing tests

* Fix mocks in tests

* Fix test

* Remove mongoengine check since schema is allowed to be empty and allowed to have both values

* Do not auto respond with URL/name if missing

* Harvest schemas from conformsTo

* Small tweaks

* Add changelog

* Review fixes

* Move mock data to factories

* add comments for magic method of field validation

* Add schema in mask

* fix version check in form

* Set the name and version instead of URL for known schemas in harvesting

* Remove getters in Schema

* Fix tests

* Move validation to model

* Fix cannot remove schema

* Add update tests

* Fix validation of schema with only version
  • Loading branch information
ThibaudDauce authored Feb 21, 2024
1 parent 938c703 commit 88ddde5
Show file tree
Hide file tree
Showing 16 changed files with 469 additions and 205 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
- Organization can now define a custom metadata of a chosen type
- Dataset belonging to the organization can assign a value to the defined metadata
- Metadata value must match the type chosen by the organization
- Harvest DCAT conformsTo into schemas for resources and datasets
- Better reporting in spam detection (show the writer of the discussion/message) [#2965](https://github.com/opendatateam/udata/pull/2965)
- Fix: spam lang detection not lowering input resulting in false positives [#2965](https://github.com/opendatateam/udata/pull/2965)
- Fix: do not send mail about discussions when there is no owner / no organisation members [#2962](https://github.com/opendatateam/udata/pull/2962)
Expand All @@ -22,6 +23,7 @@
- Add downloads count in datasets' CSV [#2953](https://github.com/opendatateam/udata/pull/2953)
- Allow dicts in datasets' extras [#2958](https://github.com/opendatateam/udata/pull/2958)


## 7.0.2 (2024-01-23)

- Improve search serialization perfs for datasets in big topics [#2937](https://github.com/opendatateam/udata/pull/2937)
Expand Down
2 changes: 1 addition & 1 deletion js/components/dataset/resource/form.vue
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,7 @@ export default {
let el = this.$refs.form.$form.querySelector("select[name='schema.name']");
if (! el) return {}
return el.value ? { schema: { name: el.value } } : { schema: null };
return { schema: { name: el.value ? el.value : null } };
},
validate() {
return this.$refs.form.validate();
Expand Down
4 changes: 2 additions & 2 deletions udata/core/dataset/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@
resource_fields,
resource_type_fields,
upload_fields,
schema_fields,
catalog_schema_fields,
)
from udata.linkchecker.checker import check_resource
from udata.core.topic.models import Topic
Expand Down Expand Up @@ -728,7 +728,7 @@ def get(self):
@ns.route('/schemas/', endpoint='schemas')
class SchemasAPI(API):
@api.doc('schemas')
@api.marshal_list_with(schema_fields)
@api.marshal_list_with(catalog_schema_fields)
def get(self):
'''List all available schemas'''
try:
Expand Down
16 changes: 13 additions & 3 deletions udata/core/dataset/api_fields.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,13 @@
required=True)
})

# Used for the schema embedded inside a Dataset or a Resource
schema_fields = api.model('Schema', {
'name': fields.String(),
'version': fields.String(),
'url': fields.String(),
})

dataset_harvest_fields = api.model('HarvestDatasetMetadata', {
'backend': fields.String(description='Harvest backend used', allow_null=True),
'created_at': fields.ISODateTime(description='The dataset harvested creation date',
Expand Down Expand Up @@ -119,7 +126,8 @@
'loaded as a standalone page (ie. iframe or '
'new page)',
readonly=True),
'schema': fields.Raw(description='Reference to the associated schema', readonly=True),
'schema': fields.Nested(
schema_fields, allow_null=True, description='Reference to the associated schema'),
'internal': fields.Nested(
resource_internal_fields, readonly=True, description='Site internal and specific object\'s data'),
})
Expand Down Expand Up @@ -171,7 +179,7 @@
DEFAULT_MASK = ','.join((
'id', 'title', 'acronym', 'slug', 'description', 'created_at', 'last_modified', 'deleted',
'private', 'tags', 'badges', 'resources', 'frequency', 'frequency_date', 'extras', 'harvest',
'metrics', 'organization', 'owner', 'temporal_coverage', 'spatial', 'license',
'metrics', 'organization', 'owner', 'schema', 'temporal_coverage', 'spatial', 'license',
'uri', 'page', 'last_update', 'archived', 'quality', 'internal', 'contact_point',
))

Expand Down Expand Up @@ -245,6 +253,8 @@
'quality': fields.Raw(description='The dataset quality', readonly=True),
'last_update': fields.ISODateTime(
description='The resources last modification date', required=True),
'schema': fields.Nested(
schema_fields, allow_null=True, description='Reference to the associated schema'),
'internal': fields.Nested(
dataset_internal_fields, readonly=True, description='Site internal and specific object\'s data'),
'contact_point': fields.Nested(contact_point_fields, allow_null=True, description='The dataset\'s contact points'),
Expand Down Expand Up @@ -273,7 +283,7 @@
})


schema_fields = api.model('Schema', {
catalog_schema_fields = api.model('CatalogSchema', {
'id': fields.String(description='The schema identifier'),
'label': fields.String(description='The schema display name'),
'versions': fields.List(fields.String, description='The available versions of the schema'),
Expand Down
6 changes: 5 additions & 1 deletion udata/core/dataset/apiv2.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@
dataset_harvest_fields,
dataset_internal_fields,
resource_harvest_fields,
resource_internal_fields
resource_internal_fields,
catalog_schema_fields,
schema_fields
)
from udata.core.spatial.api_fields import geojson
from udata.core.contact_point.api_fields import contact_point_fields
Expand Down Expand Up @@ -174,6 +176,8 @@
apiv2.inherit('DatasetInternals', dataset_internal_fields)
apiv2.inherit('ResourceInternals', resource_internal_fields)
apiv2.inherit('ContactPoint', contact_point_fields)
apiv2.inherit('Schema', schema_fields)
apiv2.inherit('CatalogSchema', catalog_schema_fields)


@ns.route('/search/', endpoint='dataset_search')
Expand Down
33 changes: 33 additions & 0 deletions udata/core/dataset/factories.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
import factory

import json
from os.path import join

from udata.app import ROOT_DIR
from udata.factories import ModelFactory

from .models import Dataset, Resource, Checksum, CommunityResource, License
Expand Down Expand Up @@ -72,3 +76,32 @@ class Meta:
id = factory.Faker('unique_string')
title = factory.Faker('sentence')
url = factory.Faker('uri')

class ResourceSchemaMockData():
    """Mock schema catalog data shared by the test suites."""

    @staticmethod
    def get_mock_data():
        """Load and return the content of ``tests/schemas.json``.

        The path is resolved relative to ``ROOT_DIR``. The file is opened in
        a ``with`` block so the handle is closed deterministically, and it is
        read as UTF-8 (the JSON interchange encoding) rather than the
        platform default encoding.
        """
        with open(join(ROOT_DIR, 'tests', 'schemas.json'), encoding='utf-8') as mock_file:
            return json.load(mock_file)

    @staticmethod
    def get_expected_v1_result_from_mock_data():
        """Return the expected catalog entries (``id``, ``label``, ``versions``).

        A new list is built on every call, so callers may mutate the result
        without affecting later calls.
        """
        return [
            {
                "id": "etalab/schema-irve-statique",
                "label": "IRVE statique",
                "versions": [
                    "2.2.0",
                    "2.2.1"
                ]
            },
            {
                "id": "139bercy/format-commande-publique",
                "label": "Données essentielles des marchés publics français",
                "versions": [
                    "1.3.0",
                    "1.4.0",
                    "1.5.0",
                    "2.0.0",
                    "2.0.1"
                ]
            }
        ]
73 changes: 22 additions & 51 deletions udata/core/dataset/forms.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,21 @@
from urllib.parse import urlparse

from flask import current_app
from mongoengine import ValidationError

from udata.forms import ModelForm, fields, validators
from udata.i18n import lazy_gettext as _
from udata.uris import validate as validate_url, ValidationError

from udata.core.storages import resources
from udata.core.spatial.forms import SpatialCoverageField

from .models import (
Dataset, Resource, License, Checksum, CommunityResource,
Dataset, Resource, Schema, License, Checksum, CommunityResource,
UPDATE_FREQUENCIES, DEFAULT_FREQUENCY, RESOURCE_FILETYPES, CHECKSUM_TYPES,
LEGACY_FREQUENCIES, RESOURCE_TYPES, TITLE_SIZE_LIMIT, DESCRIPTION_SIZE_LIMIT,
ResourceSchema,
)

__all__ = ('DatasetForm', 'ResourceForm', 'CommunityResourceForm')

from ...models import FieldValidationError


class ChecksumForm(ModelForm):
model_class = Checksum
Expand All @@ -32,46 +30,23 @@ def normalize_format(data):
return data.strip().lower()


def enforce_allowed_schemas(form, field):
    """Validate the schema dict held by ``field.data``.

    An empty or missing value is accepted as is. Otherwise, in order:

    - exactly one of ``name`` or ``url`` must be present;
    - a ``url`` must pass ``validate_url``;
    - a ``name`` must be one of the ids returned by ``ResourceSchema.objects()``;
    - a ``version`` must be one of the versions listed for that schema, or
      ``'latest'``;
    - no key other than ``name``, ``version`` and ``url`` is allowed.

    :param form: the form being validated (unused).
    :param field: the field whose ``data`` holds the schema dict.
    :raises validators.ValidationError: on the first violated rule.
    """
    schema = field.data
    if not schema:
        return

    # Fetch the catalog once and reuse it for the name and version checks.
    catalog = list(ResourceSchema.objects())
    allowed_schemas = [entry['id'] for entry in catalog]

    # Both present or both absent is rejected (exclusive-or requirement).
    if ('name' in schema) == ('url' in schema):
        raise validators.ValidationError(_('Schema must have at least a name or an url. Having both is not allowed.'))

    if 'url' in schema:
        try:
            validate_url(schema.get('url'))
        except ValidationError:
            raise validators.ValidationError(_('Provided URL is not valid.'))

    if 'name' in schema and schema.get('name') not in allowed_schemas:
        message = _('Schema name "{schema}" is not an allowed value. Allowed values: {values}')
        raise validators.ValidationError(message.format(
            schema=schema.get('name'),
            values=', '.join(allowed_schemas)
        ))

    schema_versions = next(
        (entry['versions'] for entry in catalog if entry['id'] == schema.get('name')),
        [],
    )
    # Build a new list instead of appending to the catalog's own list:
    # mutating it in place would alter the data returned by
    # ResourceSchema.objects(), and if that data is shared or cached between
    # calls 'latest' would pile up in it.
    allowed_versions = [*schema_versions, 'latest']
    if 'version' in schema and schema.get('version') not in allowed_versions:
        message = _('Version "{version}" is not an allowed value. Allowed values: {values}')
        raise validators.ValidationError(message.format(
            version=schema.get('version'),
            values=', '.join(allowed_versions)
        ))

    properties = ['name', 'version', 'url']
    for prop in schema:
        if prop not in properties:
            message = _('Sub-property "{prop}" is not allowed value in schema field. Allowed values is : {properties}')
            raise validators.ValidationError(message.format(
                prop=prop,
                properties=', '.join(properties),
            ))
class SchemaForm(ModelForm):
    """Form for the schema attached to a resource (``url``, ``name``, ``version``)."""
    model_class = Schema
    url = fields.URLField(_('URL of the schema'))
    name = fields.StringField(_('Name of the schema'))
    version = fields.StringField(_('Version of the schema'))

    def validate(self, extra_validators=None):
        """Run the regular form validation, then the ``Schema`` model's ``clean()``.

        A ``FieldValidationError`` raised by ``clean()`` is attached to the
        form field it names, and the form is then reported as invalid.
        Otherwise the result of the parent validation is returned.
        """
        is_valid = super().validate(extra_validators)

        try:
            Schema(
                url=self.url.data,
                name=self.name.data,
                version=self.version.data,
            ).clean()
        except FieldValidationError as error:
            # Report the model-level failure on the form field it names.
            getattr(self, error.field).errors.append(error.message)
            return False
        else:
            return is_valid


class BaseResourceForm(ModelForm):
Expand Down Expand Up @@ -103,11 +78,7 @@ class BaseResourceForm(ModelForm):
_('Size'), [validators.optional()],
description=_('The file size in bytes'))
extras = fields.ExtrasField()
schema = fields.DictField(
_('Schema'),
default={},
validators=[validators.optional(), enforce_allowed_schemas],
description=_('The schema slug the resource adheres to'))
schema = fields.FormField(SchemaForm)


class ResourceForm(BaseResourceForm):
Expand Down
Loading

0 comments on commit 88ddde5

Please sign in to comment.