Skip to content

Commit

Permalink
Move validation to model
Browse files Browse the repository at this point in the history
  • Loading branch information
ThibaudDauce committed Feb 7, 2024
1 parent e3a7d82 commit 3340382
Show file tree
Hide file tree
Showing 8 changed files with 125 additions and 105 deletions.
67 changes: 14 additions & 53 deletions udata/core/dataset/forms.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
from urllib.parse import urlparse

from flask import current_app
from mongoengine import ValidationError

from udata.forms import ModelForm, fields, validators
from udata.i18n import lazy_gettext as _
from udata.uris import validate as validate_url, ValidationError

from udata.core.storages import resources
from udata.core.spatial.forms import SpatialCoverageField
Expand All @@ -13,11 +10,12 @@
Dataset, Resource, Schema, License, Checksum, CommunityResource,
UPDATE_FREQUENCIES, DEFAULT_FREQUENCY, RESOURCE_FILETYPES, CHECKSUM_TYPES,
LEGACY_FREQUENCIES, RESOURCE_TYPES, TITLE_SIZE_LIMIT, DESCRIPTION_SIZE_LIMIT,
ResourceSchema,
)

__all__ = ('DatasetForm', 'ResourceForm', 'CommunityResourceForm')

from ...models import FieldValidationError


class ChecksumForm(ModelForm):
model_class = Checksum
Expand All @@ -38,54 +36,17 @@ class SchemaForm(ModelForm):
name = fields.StringField(_('Name of the schema'))
version = fields.StringField(_('Version of the schema'))

def validate_url(form, field):
'''
This method is auto called by WTForms for the URL
'''
if not field.data:
if form.name.data:
return
raise validators.ValidationError(_('URL is required when name is missing.'))

if form.name.data:
raise validators.ValidationError(_('Having both name and URL is not allowed.'))

def validate_name(form, field):
'''
This method is auto called by WTForms for the name
'''
if not field.data:
if form.url.data:
return
raise validators.ValidationError(_('Name is required when URL is missing.'))

if form.url.data:
raise validators.ValidationError(_('Having both name and URL is not allowed.'))

name = field.data
version = form.version.data

# If there is no URL, the name must match a known schema from our catalog.
existing_schema = ResourceSchema.get_schema_by_name(name)

if not existing_schema:
message = _('Schema name "{schema}" is not an allowed value. Allowed values: {values}')
raise validators.ValidationError(message.format(
schema=name,
values=', '.join(map(lambda schema: schema['name'], ResourceSchema.all()))
))

if version:
allowed_versions = list(map(lambda version: version['version_name'], existing_schema['versions']))
allowed_versions.append('latest')

if version not in allowed_versions:
message = _('Version "{version}" is not an allowed value for the schema "{name}". Allowed versions: {values}')
raise validators.ValidationError(message.format(
version=version,
name=name,
values=', '.join(allowed_versions)
))
def validate(self, extra_validators = None):
validation = super().validate(extra_validators)

try:
Schema(url=self.url.data, name=self.name.data, version=self.version.data).clean()
except FieldValidationError as err:
field = getattr(self, err.field)
field.errors.append(err.message)
return False

return validation


class BaseResourceForm(ModelForm):
Expand Down
54 changes: 47 additions & 7 deletions udata/core/dataset/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from udata.app import cache
from udata.core import storages
from udata.frontend.markdown import mdstrip
from udata.models import db, WithMetrics, BadgeMixin, SpatialCoverage
from udata.models import db, WithMetrics, BadgeMixin, SpatialCoverage, FieldValidationError
from udata.i18n import lazy_gettext as _
from udata.utils import get_by, hash_url, to_naive_datetime
from udata.uris import ValidationError, endpoint_for
Expand Down Expand Up @@ -151,25 +151,65 @@ class HarvestResourceMetadata(DynamicEmbeddedDocument):


class Schema(db.EmbeddedDocument):
'''
"""
Schema can only be two things right now:
- Known schema: name is set, url is not set, version is maybe set
- Known schema: url is not set, name is set, version is maybe set
- Unknown schema: url is set, name and version are maybe set
'''
"""
url = db.URLField()
name = db.StringField()
version = db.StringField()

def __bool__(self):
'''
"""
In the database, since the schemas were only simple dicts, there is
empty `{}` stored. To prevent problems with converting to bool
empty `{}` stored. To prevent problems with converting to bool
(bool({}) and default bool(Schema) do not yield the same value), we transform
empty Schema() to False.
It's maybe not necessary but being paranoid here.
'''
"""
return bool(self.name) or bool(self.url)

def clean(self):
super().clean()

# First check if the URL is a known schema
if self.url:
info = ResourceSchema.get_existing_schema_info_by_url(self.url)
if info:
self.url = None
self.name = info[0]
self.version = info[1]
return

# We know this schema so we can do some checks
if not self.url:
if not self.name:
raise FieldValidationError(_('Name is required when URL is missing.'), field='name')

existing_schema = ResourceSchema.get_schema_by_name(self.name)
if not existing_schema:
message = _('Schema name "{schema}" is not an allowed value. Allowed values: {values}')
raise FieldValidationError(message.format(
schema=self.name,
values=', '.join(map(lambda schema: schema['name'], ResourceSchema.all()))
), field='name')

if self.version:
allowed_versions = list(map(lambda version: version['version_name'], existing_schema['versions']))
allowed_versions.append('latest')

if self.version not in allowed_versions:
message = _(
'Version "{version}" is not an allowed value for the schema "{name}". Allowed versions: {'
'values}')
raise FieldValidationError(message.format(
version=self.version,
name=self.name,
values=', '.join(allowed_versions)
), field='version')


class License(db.Document):
# We need to declare id explicitly since we do not use the default
# value set by Mongo.
Expand Down
7 changes: 4 additions & 3 deletions udata/harvest/tests/dcat/bnodes.xml
Original file line number Diff line number Diff line change
Expand Up @@ -116,13 +116,14 @@
<dcterms:format>JSON</dcterms:format>
<dcterms:title>Resource 1-2</dcterms:title>
<dcat:accessURL>http://data.test.org/datasets/1/resources/2/file.json</dcat:accessURL>
<dct:conformsTo rdf:resource="https://schema.data.gouv.fr/schemas/etalab/schema-irve-statique/1337.42.0/schema-statique.json" />
</dcat:Distribution>
<dcat:Distribution rdf:about="http://data.test.org/datasets/2/resources/1">
<dcat:accessURL>http://data.test.org/datasets/2/resources/1/file.json</dcat:accessURL>
<dcterms:title>Resource 2-1</dcterms:title>
<dcterms:format>JSON</dcterms:format>
<dcterms:description>A JSON resource</dcterms:description>
<dct:conformsTo rdf:resource="https://schema.data.gouv.fr/schemas/etalab/schema-irve-statique/2.2.1/schema-statique.json" />
<dct:conformsTo rdf:resource="https://example.org/schema.json" />
</dcat:Distribution>
<dcat:Distribution rdf:about="datasets/3/resources/1">
<dcterms:format>JSON</dcterms:format>
Expand All @@ -132,8 +133,8 @@
<dct:conformsTo rdf:resource="https://schema.data.gouv.fr/schemas/etalab/schema-irve-statique/2.2.0/schema-statique.json" />
</dcat:Distribution>
<dcterms:Location rdf:about="http://wuEurope.com/"/>
<dcterms:Standard rdf:about="https://schema.data.gouv.fr/schemas/etalab/schema-irve-statique/2.2.1/schema-statique.json">
<dcterms:title>Infrastructures de recharges pour véhicules électriques (IRVE)</dcterms:title>
<dcterms:Standard rdf:about="https://example.org/schema.json">
<dcterms:title>Example Schema</dcterms:title>
<dcterms:issued rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2024-01-23T18:59:02.737480</dcterms:issued>
</dcterms:Standard>
<foaf:Organization rdf:about="http://data.test.org/organizations/1">
Expand Down
22 changes: 17 additions & 5 deletions udata/harvest/tests/test_dcat_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,10 @@ def test_flat_with_blank_nodes(self, rmock):
assert datasets['1'].resources[0].format == 'json'
assert datasets['1'].resources[0].mime == 'application/json'

@pytest.mark.options(SCHEMA_CATALOG_URL='https://example.com/schemas')
def test_flat_with_blank_nodes_xml(self, rmock):
rmock.get('https://example.com/schemas', json=ResourceSchemaMockData.get_mock_data())

filename = 'bnodes.xml'
url = mock_dcat(rmock, filename)
org = OrganizationFactory()
Expand Down Expand Up @@ -168,21 +171,30 @@ def test_harvest_schemas(self, rmock):

datasets = {d.harvest.dct_identifier: d for d in Dataset.objects}

assert datasets['3'].schema == None
assert datasets['1'].schema == None
resources_by_title = { resource['title']: resource for resource in datasets['1'].resources }

# Schema with wrong version are considered as external. Maybe we could change this in the future
assert resources_by_title['Resource 1-2'].schema.url == 'https://schema.data.gouv.fr/schemas/etalab/schema-irve-statique/1337.42.0/schema-statique.json'
assert resources_by_title['Resource 1-2'].schema.name == None
assert resources_by_title['Resource 1-2'].schema.version == None

assert datasets['2'].schema.name == None
assert datasets['2'].schema.url == 'https://www.ecologie.gouv.fr/sites/default/files/R%C3%A9glementation%20IRVE.pdf'

resources_by_title = { resource['title']: resource for resource in datasets['2'].resources }

assert resources_by_title['Resource 2-1'].schema.name == 'Infrastructures de recharges pour véhicules électriques (IRVE)'
assert resources_by_title['Resource 2-1'].schema.url == 'https://schema.data.gouv.fr/schemas/etalab/schema-irve-statique/2.2.1/schema-statique.json'
# Unknown schema are kept as they were provided
assert resources_by_title['Resource 2-1'].schema.name == 'Example Schema'
assert resources_by_title['Resource 2-1'].schema.url == 'https://example.org/schema.json'
assert resources_by_title['Resource 2-1'].schema.version == None

assert resources_by_title['Resource 2-2'].schema == None

assert datasets['3'].schema == None
resources_by_title = { resource['title']: resource for resource in datasets['3'].resources }

# If there is only an URL and it matches a known schema inside the catalog, only set the name and the version (discard the URL)
# If there is just the URL, and it matches a known schema inside the catalog, only set the name and the version
# (discard the URL)
assert resources_by_title['Resource 3-1'].schema.name == 'etalab/schema-irve-statique'
assert resources_by_title['Resource 3-1'].schema.url == None
assert resources_by_title['Resource 3-1'].schema.version == '2.2.0'
Expand Down
9 changes: 9 additions & 0 deletions udata/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,14 @@ def resolve_model(self, model):
raise ValueError(message)


class FieldValidationError(ValidationError):
field: str

def __init__(self, *args, field: str, **kwargs):
self.field = field
super().__init__(*args, **kwargs)


db = UDataMongoEngine()
session_interface = MongoEngineSessionInterface(db)

Expand Down Expand Up @@ -112,6 +120,7 @@ def resolve_model(self, model):
MONGODB_DEPRECATED_MSG = '{0} is deprecated, use the MONGODB_HOST url syntax'



def validate_config(config):
for setting in MONGODB_DEPRECATED_SETTINGS:
if setting in config:
Expand Down
19 changes: 3 additions & 16 deletions udata/rdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,26 +231,13 @@ def schema_from_rdf(rdf):

schema = Schema()
if isinstance(resource, (URIRef, Literal)):
url = resource.toPython()
schema.url = resource.toPython()
elif isinstance(resource, RdfResource):
url = resource.identifier.toPython()
name = resource.value(DCT.title)
schema.url = resource.identifier.toPython()
schema.name = resource.value(DCT.title)
else:
return None

if url and name:
schema.url = url
schema.name = name
elif url:
# If the URL exists inside our schema catalog we want to only set
# the name of the schema, not the URL (and the version if it exists).
info = ResourceSchema.get_existing_schema_info_by_url(url)
if info:
schema.name = info[0]
schema.version = info[1]
else:
schema.url = url

return schema

def escape_xml_illegal_chars(val, replacement='?'):
Expand Down
32 changes: 16 additions & 16 deletions udata/tests/api/test_datasets_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,10 +156,10 @@ def test_dataset_api_list_with_filters(self):
org_dataset = VisibleDatasetFactory(organization=org)

schema_dataset = VisibleDatasetFactory(resources=[
ResourceFactory(schema={'name': 'my-schema', 'version': '1.0.0'})
ResourceFactory(schema={'name': 'my-schema', 'url': 'https://example.org', 'version': '1.0.0'})
])
schema_version2_dataset = VisibleDatasetFactory(resources=[
ResourceFactory(schema={'name': 'other-schema', 'version': '2.0.0'})
ResourceFactory(schema={'name': 'other-schema', 'url': 'https://example.org', 'version': '2.0.0'})
])

# filter on tag
Expand Down Expand Up @@ -728,17 +728,6 @@ def test_dataset_new_resource_with_schema(self, rmock):
data = dataset.to_dict()
resource_data = ResourceFactory.as_dict()

resource_data['schema'] = {
'name': 'my-schema',
'version': '1.0.0',
'url': 'http://example.com'
}
data['resources'].append(resource_data)
response = self.put(url_for('api.dataset', dataset=dataset), data)
self.assert400(response)
assert response.json['errors']['resources'][0]['schema']['url'] == [_('Having both name and URL is not allowed.')]
assert response.json['errors']['resources'][0]['schema']['name'] == [_('Having both name and URL is not allowed.')]

resource_data['schema'] = {'url': 'test'}
data['resources'].append(resource_data)
response = self.put(url_for('api.dataset', dataset=dataset), data)
Expand All @@ -755,15 +744,16 @@ def test_dataset_new_resource_with_schema(self, rmock):
data['resources'].append(resource_data)
response = self.put(url_for('api.dataset', dataset=dataset), data)
self.assert400(response)
assert response.json['errors']['resources'][0]['schema']['name'] == [_('Version "{version}" is not an allowed value for the schema "{name}". Allowed versions: {values}').format(version='42.0.0', name='etalab/schema-irve-statique', values='2.2.0, 2.2.1, latest')]
assert response.json['errors']['resources'][0]['schema']['version'] == [_('Version "{version}" is not an allowed value for the schema "{name}". Allowed versions: {values}').format(version='42.0.0', name='etalab/schema-irve-statique', values='2.2.0, 2.2.1, latest')]

resource_data['schema'] = {'url': 'http://example.com'}
resource_data['schema'] = {'url': 'http://example.com', 'name': 'etalab/schema-irve-statique'}
data['resources'].append(resource_data)
response = self.put(url_for('api.dataset', dataset=dataset), data)
self.assert200(response)
dataset.reload()
assert dataset.resources[0].schema['url'] == 'http://example.com'
assert dataset.resources[0].schema['name'] == None
assert dataset.resources[0].schema['name'] == 'etalab/schema-irve-statique'
assert dataset.resources[0].schema['version'] == None

resource_data['schema'] = {'name': 'etalab/schema-irve-statique'}
data['resources'].append(resource_data)
Expand All @@ -785,6 +775,16 @@ def test_dataset_new_resource_with_schema(self, rmock):
assert dataset.resources[0].schema['url'] == None
assert dataset.resources[0].schema['version'] == '2.2.0'

resource_data['schema'] = {'url': 'https://schema.data.gouv.fr/schemas/etalab/schema-irve-statique/2.2.1/schema-statique.json'}
data['resources'].append(resource_data)
response = self.put(url_for('api.dataset', dataset=dataset), data)
self.assert200(response)

dataset.reload()
assert dataset.resources[0].schema['name'] == 'etalab/schema-irve-statique'
assert dataset.resources[0].schema['url'] == None
assert dataset.resources[0].schema['version'] == '2.2.1'


class DatasetBadgeAPITest(APITestCase):
@classmethod
Expand Down
Loading

0 comments on commit 3340382

Please sign in to comment.