Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Harvest dct:conformsTo for Dataset and Resource #2949

Merged
merged 24 commits into from
Feb 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
ef6a436
Update schemas to embed document and improve validation
ThibaudDauce Jan 25, 2024
85779b4
fixing tests
ThibaudDauce Jan 25, 2024
83ce96d
Fixing tests
ThibaudDauce Jan 25, 2024
a1a0754
Fix mocks in tests
ThibaudDauce Jan 25, 2024
cc60e9b
Fix test
ThibaudDauce Jan 25, 2024
fad29a5
Remove mongoengine check since schema is allow to be empty and allow …
ThibaudDauce Jan 25, 2024
0313f52
Do not auto respond with URL/name if missing
ThibaudDauce Jan 25, 2024
9616c52
Harvest schemas from conformsTo
ThibaudDauce Jan 25, 2024
37286cc
Small tweaks
ThibaudDauce Jan 25, 2024
cf50a6e
Add changelog
ThibaudDauce Jan 30, 2024
dd5e7bd
Review fixes
ThibaudDauce Jan 30, 2024
ade966c
Move mock data to factories
ThibaudDauce Jan 30, 2024
6bdc328
add comments for magic method of field validation
ThibaudDauce Feb 6, 2024
49c66aa
Add schema in mask
ThibaudDauce Feb 6, 2024
0087b44
fix version check in form
ThibaudDauce Feb 6, 2024
0d925bd
Set the name and version instead of URL for known schemas in harvesting
ThibaudDauce Feb 6, 2024
b0ad1b7
Remove getters in Schema
ThibaudDauce Feb 6, 2024
1a8142e
Fix tests
ThibaudDauce Feb 6, 2024
0f863a0
Move validation to model
ThibaudDauce Feb 7, 2024
9c21310
Fix canont remove schema
ThibaudDauce Feb 8, 2024
966abbd
Add update tests
ThibaudDauce Feb 8, 2024
1102c54
Fix validation of schema with only version
ThibaudDauce Feb 8, 2024
1524ab1
Merge branch 'master' into harvest_schemas
ThibaudDauce Feb 15, 2024
a98593b
Merge branch 'master' into harvest_schemas
ThibaudDauce Feb 20, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
- Organization can now define custom metadata of a chosen type
- Dataset belonging to the organization can assign a value to the defined metadata
- Metadata value must match the type chosen by the organization
- Harvest DCAT conformsTo into schemas for resources and datasets
- Better reporting in spam detection (show the writer of the discussion/message) [#2965](https://github.com/opendatateam/udata/pull/2965)
- Fix: spam lang detection not lowering input resulting in false positives [#2965](https://github.com/opendatateam/udata/pull/2965)
- Fix: do not send mail about discussions when there is no owner / no organisation members [#2962](https://github.com/opendatateam/udata/pull/2962)
Expand All @@ -22,6 +23,7 @@
- Add downloads count in datasets' CSV [#2953](https://github.com/opendatateam/udata/pull/2953)
- Allow dicts in datasets' extras [#2958](https://github.com/opendatateam/udata/pull/2958)


## 7.0.2 (2024-01-23)

- Improve search serialization perfs for datasets in big topics [#2937](https://github.com/opendatateam/udata/pull/2937)
Expand Down
2 changes: 1 addition & 1 deletion js/components/dataset/resource/form.vue
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,7 @@ export default {
let el = this.$refs.form.$form.querySelector("select[name='schema.name']");
if (! el) return {}

return el.value ? { schema: { name: el.value } } : { schema: null };
return { schema: { name: el.value ? el.value : null } };
},
validate() {
return this.$refs.form.validate();
Expand Down
4 changes: 2 additions & 2 deletions udata/core/dataset/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@
resource_fields,
resource_type_fields,
upload_fields,
schema_fields,
catalog_schema_fields,
)
from udata.linkchecker.checker import check_resource
from udata.core.topic.models import Topic
Expand Down Expand Up @@ -728,7 +728,7 @@ def get(self):
@ns.route('/schemas/', endpoint='schemas')
class SchemasAPI(API):
@api.doc('schemas')
@api.marshal_list_with(schema_fields)
@api.marshal_list_with(catalog_schema_fields)
def get(self):
'''List all available schemas'''
try:
Expand Down
16 changes: 13 additions & 3 deletions udata/core/dataset/api_fields.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,13 @@
required=True)
})

# Used for the schema embedded inside a Dataset or a Resource.
# Declares the three string fields of the 'Schema' API model:
# 'name', 'version' and 'url'.
schema_fields = api.model('Schema', {
'name': fields.String(),
'version': fields.String(),
'url': fields.String(),
})

dataset_harvest_fields = api.model('HarvestDatasetMetadata', {
'backend': fields.String(description='Harvest backend used', allow_null=True),
'created_at': fields.ISODateTime(description='The dataset harvested creation date',
Expand Down Expand Up @@ -119,7 +126,8 @@
'loaded as a standalone page (ie. iframe or '
'new page)',
readonly=True),
'schema': fields.Raw(description='Reference to the associated schema', readonly=True),
'schema': fields.Nested(
schema_fields, allow_null=True, description='Reference to the associated schema'),
'internal': fields.Nested(
resource_internal_fields, readonly=True, description='Site internal and specific object\'s data'),
})
Expand Down Expand Up @@ -171,7 +179,7 @@
DEFAULT_MASK = ','.join((
'id', 'title', 'acronym', 'slug', 'description', 'created_at', 'last_modified', 'deleted',
'private', 'tags', 'badges', 'resources', 'frequency', 'frequency_date', 'extras', 'harvest',
'metrics', 'organization', 'owner', 'temporal_coverage', 'spatial', 'license',
'metrics', 'organization', 'owner', 'schema', 'temporal_coverage', 'spatial', 'license',
'uri', 'page', 'last_update', 'archived', 'quality', 'internal', 'contact_point',
))

Expand Down Expand Up @@ -245,6 +253,8 @@
'quality': fields.Raw(description='The dataset quality', readonly=True),
'last_update': fields.ISODateTime(
description='The resources last modification date', required=True),
'schema': fields.Nested(
schema_fields, allow_null=True, description='Reference to the associated schema'),
'internal': fields.Nested(
dataset_internal_fields, readonly=True, description='Site internal and specific object\'s data'),
'contact_point': fields.Nested(contact_point_fields, allow_null=True, description='The dataset\'s contact points'),
Expand Down Expand Up @@ -273,7 +283,7 @@
})


schema_fields = api.model('Schema', {
catalog_schema_fields = api.model('CatalogSchema', {
'id': fields.String(description='The schema identifier'),
'label': fields.String(description='The schema display name'),
'versions': fields.List(fields.String, description='The available versions of the schema'),
Expand Down
6 changes: 5 additions & 1 deletion udata/core/dataset/apiv2.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@
dataset_harvest_fields,
dataset_internal_fields,
resource_harvest_fields,
resource_internal_fields
resource_internal_fields,
catalog_schema_fields,
schema_fields
)
from udata.core.spatial.api_fields import geojson
from udata.core.contact_point.api_fields import contact_point_fields
Expand Down Expand Up @@ -174,6 +176,8 @@
apiv2.inherit('DatasetInternals', dataset_internal_fields)
apiv2.inherit('ResourceInternals', resource_internal_fields)
apiv2.inherit('ContactPoint', contact_point_fields)
apiv2.inherit('Schema', schema_fields)
apiv2.inherit('CatalogSchema', catalog_schema_fields)


@ns.route('/search/', endpoint='dataset_search')
Expand Down
33 changes: 33 additions & 0 deletions udata/core/dataset/factories.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
import factory

import json
from os.path import join

from udata.app import ROOT_DIR
from udata.factories import ModelFactory

from .models import Dataset, Resource, Checksum, CommunityResource, License
Expand Down Expand Up @@ -72,3 +76,32 @@ class Meta:
id = factory.Faker('unique_string')
title = factory.Faker('sentence')
url = factory.Faker('uri')

class ResourceSchemaMockData:
    """Provide the schema catalog fixture and a matching expected result."""

    @staticmethod
    def get_mock_data():
        """Return the parsed content of ``tests/schemas.json`` under ROOT_DIR.

        The file is opened in a ``with`` block so the handle is always closed,
        and decoded explicitly as UTF-8 so that non-ASCII labels do not depend
        on the platform's default encoding.
        """
        fixture_path = join(ROOT_DIR, 'tests', 'schemas.json')
        with open(fixture_path, encoding='utf-8') as fixture:
            return json.load(fixture)

    @staticmethod
    def get_expected_v1_result_from_mock_data():
        """Return the hard-coded list of schema entries (id, label, versions).

        A new list is built on every call, so callers may mutate the result
        without affecting later calls.
        NOTE(review): presumably the payload expected from the v1 schemas
        endpoint when it is fed with the mock data; confirm against the tests.
        """
        return [
            {
                "id": "etalab/schema-irve-statique",
                "label": "IRVE statique",
                "versions": [
                    "2.2.0",
                    "2.2.1",
                ],
            },
            {
                "id": "139bercy/format-commande-publique",
                "label": "Données essentielles des marchés publics français",
                "versions": [
                    "1.3.0",
                    "1.4.0",
                    "1.5.0",
                    "2.0.0",
                    "2.0.1",
                ],
            },
        ]
73 changes: 22 additions & 51 deletions udata/core/dataset/forms.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,21 @@
from urllib.parse import urlparse

from flask import current_app
from mongoengine import ValidationError

from udata.forms import ModelForm, fields, validators
from udata.i18n import lazy_gettext as _
from udata.uris import validate as validate_url, ValidationError

from udata.core.storages import resources
from udata.core.spatial.forms import SpatialCoverageField

from .models import (
Dataset, Resource, License, Checksum, CommunityResource,
Dataset, Resource, Schema, License, Checksum, CommunityResource,
UPDATE_FREQUENCIES, DEFAULT_FREQUENCY, RESOURCE_FILETYPES, CHECKSUM_TYPES,
LEGACY_FREQUENCIES, RESOURCE_TYPES, TITLE_SIZE_LIMIT, DESCRIPTION_SIZE_LIMIT,
ResourceSchema,
)

__all__ = ('DatasetForm', 'ResourceForm', 'CommunityResourceForm')

from ...models import FieldValidationError


class ChecksumForm(ModelForm):
model_class = Checksum
Expand All @@ -32,46 +30,23 @@ def normalize_format(data):
return data.strip().lower()


def enforce_allowed_schemas(form, field):
    """Validate the schema dict carried by ``field.data``.

    An empty/falsy value is accepted as-is. Otherwise the checks run in this
    order and the first failure raises:

    1. exactly one of ``name`` / ``url`` must be present;
    2. ``url``, when present, must pass ``validate_url``;
    3. ``name``, when present, must be an id listed by ``ResourceSchema.objects()``;
    4. ``version``, when present, must be one of the versions listed for that
       name, or ``'latest'``;
    5. no key other than ``name``, ``version`` and ``url`` is allowed.

    :param form: the form being validated (unused, required by the validator signature)
    :param field: the field whose ``data`` holds the schema dict
    :raises validators.ValidationError: when any of the checks above fails
    """
    schema = field.data
    if not schema:
        return

    # Fetch the catalog once and reuse it for both the name and version checks.
    catalog = list(ResourceSchema.objects())
    allowed_schemas = [s['id'] for s in catalog]

    # `^` binds tighter than `not`: this rejects "both" as well as "neither".
    if not bool('name' in schema) ^ bool('url' in schema):
        raise validators.ValidationError(_('Schema must have at least a name or an url. Having both is not allowed.'))

    if 'url' in schema:
        try:
            validate_url(schema.get('url'))
        except ValidationError as err:
            raise validators.ValidationError(_('Provided URL is not valid.')) from err

    if 'name' in schema and schema.get('name') not in allowed_schemas:
        message = _('Schema name "{schema}" is not an allowed value. Allowed values: {values}')
        raise validators.ValidationError(message.format(
            schema=schema.get('name'),
            values=', '.join(allowed_schemas)
        ))

    if 'version' in schema:
        catalog_versions = next(
            (d['versions'] for d in catalog if d['id'] == schema.get('name')),
            [],
        )
        # Build a new list instead of appending to the catalog entry's own
        # list, so 'latest' never leaks into the catalog data.
        allowed_versions = [*catalog_versions, 'latest']
        if schema.get('version') not in allowed_versions:
            message = _('Version "{version}" is not an allowed value. Allowed values: {values}')
            raise validators.ValidationError(message.format(
                version=schema.get('version'),
                values=', '.join(allowed_versions)
            ))

    properties = ['name', 'version', 'url']
    for prop in schema:
        if prop not in properties:
            message = _('Sub-property "{prop}" is not allowed value in schema field. Allowed values is : {properties}')
            raise validators.ValidationError(message.format(
                prop=prop,
                properties=', '.join(properties),
            ))
class SchemaForm(ModelForm):
    """Sub-form for a ``Schema`` made of a URL, a name and a version."""

    model_class = Schema
    url = fields.URLField(_('URL of the schema'))
    name = fields.StringField(_('Name of the schema'))
    version = fields.StringField(_('Version of the schema'))

    def validate(self, extra_validators=None):
        """Run the regular form validation, then the model-level ``clean()``.

        A ``Schema`` is built from the submitted url/name/version and cleaned.
        If that raises ``FieldValidationError``, its message is appended to the
        errors of the field named by the exception and ``False`` is returned.
        Otherwise the result of the parent validation is returned.
        """
        parent_result = super().validate(extra_validators)

        try:
            Schema(
                url=self.url.data,
                name=self.name.data,
                version=self.version.data,
            ).clean()
        except FieldValidationError as err:
            # Report the model error on the form field it refers to.
            getattr(self, err.field).errors.append(err.message)
            return False
        else:
            return parent_result


class BaseResourceForm(ModelForm):
Expand Down Expand Up @@ -103,11 +78,7 @@ class BaseResourceForm(ModelForm):
_('Size'), [validators.optional()],
description=_('The file size in bytes'))
extras = fields.ExtrasField()
schema = fields.DictField(
_('Schema'),
default={},
validators=[validators.optional(), enforce_allowed_schemas],
description=_('The schema slug the resource adheres to'))
schema = fields.FormField(SchemaForm)


class ResourceForm(BaseResourceForm):
Expand Down
Loading