From 893b4ffd2fc437191ce96110806acd5185e0a941 Mon Sep 17 00:00:00 2001 From: Francis Charette Migneault Date: Wed, 24 Apr 2024 15:53:37 -0400 Subject: [PATCH] fixes to validate bands under mlm:input and their corresponding references by other extensions --- CHANGELOG.md | 18 +- README.md | 4 +- examples/item_eo_and_raster_bands.json | 573 +++++++++++++++++++++++++ examples/item_eo_bands.json | 132 ------ json-schema/schema.json | 24 +- pyproject.toml | 26 ++ tests/conftest.py | 7 +- tests/test_schema.py | 1 + 8 files changed, 628 insertions(+), 157 deletions(-) create mode 100644 examples/item_eo_and_raster_bands.json diff --git a/CHANGELOG.md b/CHANGELOG.md index 0b30f5a..ff5edca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,24 +5,30 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [Unreleased](https://github.com/crim-ca/dlm-extension/tree/main) +## [Unreleased](https://github.com/crim-ca/mlm-extension/tree/main) ### Added -- n/a +- Add pattern for `mlm:framework`, needing at least one alphanumeric character, + without leading or trailing non-alphanumeric characters. +- Add [`examples/item_eo_and_raster_bands.json`](examples/item_eo_and_raster_bands.json) demonstrating the original + use case represented by the previous [`examples/item_eo_bands.json`](examples/item_eo_bands.json) contents. ### Changed -- n/a +- Adjust `scikit-learn` and `Hugging Face` framework names to match the format employed by the official documentation. ### Deprecated - n/a ### Removed -- n/a +- Removed combination of `mlm:input` with `bands: null` that could never occur due to pre-requirement of `type: array`. ### Fixed -- n/a +- Fix `AnyBands` definition and use in the JSON schema to better consider possible use cases with `eo` extension. 
+- Fix [`examples/item_eo_bands.json`](examples/item_eo_bands.json) that was incorrectly also using the `raster` extension. + This is not fundamentally wrong, but it did not allow validating the `eo` extension use case properly, since + the `raster:bands` reference provided a bypass that let `mlm:input[*].bands` pass validation. -## [0.1.1.alpha4](https://github.com/crim-ca/dlm-extension/tree/0.1.1.alpha4) +## [v1.0.0](https://github.com/crim-ca/mlm-extension/tree/v1.0.0) ### Added - more [Task Enum](README.md#task-enum) tasks diff --git a/README.md b/README.md index b033a38..19821ef 100644 --- a/README.md +++ b/README.md @@ -169,8 +169,8 @@ to use common names when applicable. Below are a few notable entries. - `PyTorch` - `TensorFlow` -- `Scikit-learn` -- `Huggingface` +- `scikit-learn` +- `Hugging Face` - `Keras` - `ONNX` - `rgee` diff --git a/examples/item_eo_and_raster_bands.json b/examples/item_eo_and_raster_bands.json new file mode 100644 index 0000000..8a13cc9 --- /dev/null +++ b/examples/item_eo_and_raster_bands.json @@ -0,0 +1,573 @@ +{ + "stac_version": "1.0.0", + "stac_extensions": [ + "https://crim-ca.github.io/mlm-extension/v1.0.0/schema.json", + "https://stac-extensions.github.io/eo/v1.1.0/schema.json", + "https://stac-extensions.github.io/raster/v1.1.0/schema.json", + "https://stac-extensions.github.io/file/v1.0.0/schema.json", + "https://stac-extensions.github.io/ml-aoi/v0.2.0/schema.json" + ], + "type": "Feature", + "id": "resnet-18_sentinel-2_all_moco_classification", + "collection": "ml-model-examples", + "geometry": { + "type": "Polygon", + "coordinates": [ + [ + [ + -7.882190080512502, + 37.13739173208318 + ], + [ + -7.882190080512502, + 58.21798141355221 + ], + [ + 27.911651652899923, + 58.21798141355221 + ], + [ + 27.911651652899923, + 37.13739173208318 + ], + [ + -7.882190080512502, + 37.13739173208318 + ] + ] + ] + }, + "bbox": [ + -7.882190080512502, + 37.13739173208318, + 27.911651652899923, + 58.21798141355221 + ], + "properties": { +
"description": "Sourced from torchgeo python library, identifier is ResNet18_Weights.SENTINEL2_ALL_MOCO", + "datetime": null, + "start_datetime": "1900-01-01T00:00:00Z", + "end_datetime": "9999-12-31T23:59:59Z", + "mlm:name": "Resnet-18 Sentinel-2 ALL MOCO", + "mlm:tasks": [ + "classification" + ], + "mlm:architecture": "ResNet", + "mlm:framework": "pytorch", + "mlm:framework_version": "2.1.2+cu121", + "file:size": 43000000, + "mlm:memory_size": 1, + "mlm:total_parameters": 11700000, + "mlm:pretrained_source": "EuroSat Sentinel-2", + "mlm:accelerator": "cuda", + "mlm:accelerator_constrained": false, + "mlm:accelerator_summary": "Unknown", + "mlm:batch_size_suggestion": 256, + "mlm:input": [ + { + "name": "13 Band Sentinel-2 Batch", + "bands": [ + "B01", + "B02", + "B03", + "B04", + "B05", + "B06", + "B07", + "B08", + "B8A", + "B09", + "B10", + "B11", + "B12" + ], + "input": { + "shape": [ + -1, + 13, + 64, + 64 + ], + "dim_order": [ + "batch", + "channel", + "height", + "width" + ], + "data_type": "float32" + }, + "norm_by_channel": true, + "norm_type": "z-score", + "resize_type": null, + "statistics": [ + { + "mean": 1354.40546513, + "stddev": 245.71762908 + }, + { + "mean": 1118.24399958, + "stddev": 333.00778264 + }, + { + "mean": 1042.92983953, + "stddev": 395.09249139 + }, + { + "mean": 947.62620298, + "stddev": 593.75055589 + }, + { + "mean": 1199.47283961, + "stddev": 566.4170017 + }, + { + "mean": 1999.79090914, + "stddev": 861.18399006 + }, + { + "mean": 2369.22292565, + "stddev": 1086.63139075 + }, + { + "mean": 2296.82608323, + "stddev": 1117.98170791 + }, + { + "mean": 732.08340178, + "stddev": 404.91978886 + }, + { + "mean": 12.11327804, + "stddev": 4.77584468 + }, + { + "mean": 1819.01027855, + "stddev": 1002.58768311 + }, + { + "mean": 1118.92391149, + "stddev": 761.30323499 + }, + { + "mean": 2594.14080798, + "stddev": 1231.58581042 + } + ], + "pre_processing_function": { + "format": "python", + "expression": 
"torchgeo.datamodules.eurosat.EuroSATDataModule.collate_fn" + } + } + ], + "mlm:output": [ + { + "name": "classification", + "tasks": [ + "classification" + ], + "result": { + "shape": [ + -1, + 10 + ], + "dim_order": [ + "batch", + "class" + ], + "data_type": "float32" + }, + "classification_classes": [ + { + "value": 0, + "name": "Annual Crop", + "description": null, + "title": null, + "color_hint": null, + "nodata": false + }, + { + "value": 1, + "name": "Forest", + "description": null, + "title": null, + "color_hint": null, + "nodata": false + }, + { + "value": 2, + "name": "Herbaceous Vegetation", + "description": null, + "title": null, + "color_hint": null, + "nodata": false + }, + { + "value": 3, + "name": "Highway", + "description": null, + "title": null, + "color_hint": null, + "nodata": false + }, + { + "value": 4, + "name": "Industrial Buildings", + "description": null, + "title": null, + "color_hint": null, + "nodata": false + }, + { + "value": 5, + "name": "Pasture", + "description": null, + "title": null, + "color_hint": null, + "nodata": false + }, + { + "value": 6, + "name": "Permanent Crop", + "description": null, + "title": null, + "color_hint": null, + "nodata": false + }, + { + "value": 7, + "name": "Residential Buildings", + "description": null, + "title": null, + "color_hint": null, + "nodata": false + }, + { + "value": 8, + "name": "River", + "description": null, + "title": null, + "color_hint": null, + "nodata": false + }, + { + "value": 9, + "name": "SeaLake", + "description": null, + "title": null, + "color_hint": null, + "nodata": false + } + ], + "post_processing_function": null + } + ], + "eo:bands": [ + { + "name": "B01", + "common_name": "coastal", + "description": "Coastal aerosol (band 1)", + "center_wavelength": 0.443, + "full_width_half_max": 0.027 + }, + { + "name": "B02", + "common_name": "blue", + "description": "Blue (band 2)", + "center_wavelength": 0.49, + "full_width_half_max": 0.098 + }, + { + "name": "B03", + 
"common_name": "green", + "description": "Green (band 3)", + "center_wavelength": 0.56, + "full_width_half_max": 0.045 + }, + { + "name": "B04", + "common_name": "red", + "description": "Red (band 4)", + "center_wavelength": 0.665, + "full_width_half_max": 0.038 + }, + { + "name": "B05", + "common_name": "rededge", + "description": "Red edge 1 (band 5)", + "center_wavelength": 0.704, + "full_width_half_max": 0.019 + }, + { + "name": "B06", + "common_name": "rededge", + "description": "Red edge 2 (band 6)", + "center_wavelength": 0.74, + "full_width_half_max": 0.018 + }, + { + "name": "B07", + "common_name": "rededge", + "description": "Red edge 3 (band 7)", + "center_wavelength": 0.783, + "full_width_half_max": 0.028 + }, + { + "name": "B08", + "common_name": "nir", + "description": "NIR 1 (band 8)", + "center_wavelength": 0.842, + "full_width_half_max": 0.145 + }, + { + "name": "B8A", + "common_name": "nir08", + "description": "NIR 2 (band 8A)", + "center_wavelength": 0.865, + "full_width_half_max": 0.033 + }, + { + "name": "B09", + "common_name": "nir09", + "description": "NIR 3 (band 9)", + "center_wavelength": 0.945, + "full_width_half_max": 0.026 + }, + { + "name": "B10", + "common_name": "cirrus", + "description": "SWIR - Cirrus (band 10)", + "center_wavelength": 1.375, + "full_width_half_max": 0.026 + }, + { + "name": "B11", + "common_name": "swir16", + "description": "SWIR 1 (band 11)", + "center_wavelength": 1.61, + "full_width_half_max": 0.143 + }, + { + "name": "B12", + "common_name": "swir22", + "description": "SWIR 2 (band 12)", + "center_wavelength": 2.19, + "full_width_half_max": 0.242 + } + ], + "raster:bands": [ + { + "name": "B01", + "nodata": 0, + "data_type": "uint16", + "bits_per_sample": 15, + "spatial_resolution": 60, + "scale": 0.0001, + "offset": 0, + "unit": "m" + }, + { + "name": "B02", + "nodata": 0, + "data_type": "uint16", + "bits_per_sample": 15, + "spatial_resolution": 10, + "scale": 0.0001, + "offset": 0, + "unit": "m" + }, + { + 
"name": "B03", + "nodata": 0, + "data_type": "uint16", + "bits_per_sample": 15, + "spatial_resolution": 10, + "scale": 0.0001, + "offset": 0, + "unit": "m" + }, + { + "name": "B04", + "nodata": 0, + "data_type": "uint16", + "bits_per_sample": 15, + "spatial_resolution": 10, + "scale": 0.0001, + "offset": 0, + "unit": "m" + }, + { + "name": "B05", + "nodata": 0, + "data_type": "uint16", + "bits_per_sample": 15, + "spatial_resolution": 20, + "scale": 0.0001, + "offset": 0, + "unit": "m" + }, + { + "name": "B06", + "nodata": 0, + "data_type": "uint16", + "bits_per_sample": 15, + "spatial_resolution": 20, + "scale": 0.0001, + "offset": 0, + "unit": "m" + }, + { + "name": "B07", + "nodata": 0, + "data_type": "uint16", + "bits_per_sample": 15, + "spatial_resolution": 20, + "scale": 0.0001, + "offset": 0, + "unit": "m" + }, + { + "name": "B08", + "nodata": 0, + "data_type": "uint16", + "bits_per_sample": 15, + "spatial_resolution": 10, + "scale": 0.0001, + "offset": 0, + "unit": "m" + }, + { + "name": "B8A", + "nodata": 0, + "data_type": "uint16", + "bits_per_sample": 15, + "spatial_resolution": 20, + "scale": 0.0001, + "offset": 0, + "unit": "m" + }, + { + "name": "B09", + "nodata": 0, + "data_type": "uint16", + "bits_per_sample": 15, + "spatial_resolution": 60, + "scale": 0.0001, + "offset": 0, + "unit": "m" + }, + { + "name": "B10", + "nodata": 0, + "data_type": "uint16", + "bits_per_sample": 15, + "spatial_resolution": 60, + "scale": 0.0001, + "offset": 0, + "unit": "m" + }, + { + "name": "B11", + "nodata": 0, + "data_type": "uint16", + "bits_per_sample": 15, + "spatial_resolution": 20, + "scale": 0.0001, + "offset": 0, + "unit": "m" + }, + { + "name": "B12", + "nodata": 0, + "data_type": "uint16", + "bits_per_sample": 15, + "spatial_resolution": 20, + "scale": 0.0001, + "offset": 0, + "unit": "m" + } + ] + }, + "assets": { + "weights": { + "href": "https://huggingface.co/torchgeo/resnet18_sentinel2_all_moco/resolve/main/resnet18_sentinel2_all_moco-59bfdff9.pth", + 
"title": "Pytorch weights checkpoint", + "description": "A Resnet-18 classification model trained on normalized Sentinel-2 imagery with Eurosat landcover labels with torchgeo", + "type": "application/octet-stream; application=pytorch", + "roles": [ + "mlm:model", + "mlm:weights" + ], + "$comment": "Following 'eo:bands' is required to fulfil schema validation of 'eo' extension.", + "eo:bands": [ + { + "name": "coastal" + }, + { + "name": "blue" + }, + { + "name": "green" + }, + { + "name": "red" + }, + { + "name": "rededge1" + }, + { + "name": "rededge2" + }, + { + "name": "rededge3" + }, + { + "name": "nir" + }, + { + "name": "nir08" + }, + { + "name": "nir09" + }, + { + "name": "cirrus" + }, + { + "name": "swir16" + }, + { + "name": "swir22" + } + ] + }, + "source_code": { + "href": "https://github.com/microsoft/torchgeo/blob/61efd2e2c4df7ebe3bd03002ebbaeaa3cfe9885a/torchgeo/models/resnet.py#L207", + "title": "Model implementation.", + "description": "Source code to run the model.", + "type": "text/x-python", + "roles": [ + "mlm:model", + "code", + "metadata" + ] + } + }, + "links": [ + { + "rel": "collection", + "href": "./collection.json", + "type": "application/json" + }, + { + "rel": "self", + "href": "./item_eo_bands.json", + "type": "application/geo+json" + }, + { + "rel": "derived_from", + "href": "https://earth-search.aws.element84.com/v1/collections/sentinel-2-l2a", + "type": "application/json", + "ml-aoi:split": "train" + } + ] +} diff --git a/examples/item_eo_bands.json b/examples/item_eo_bands.json index 8a13cc9..800ecde 100644 --- a/examples/item_eo_bands.json +++ b/examples/item_eo_bands.json @@ -353,138 +353,6 @@ "center_wavelength": 2.19, "full_width_half_max": 0.242 } - ], - "raster:bands": [ - { - "name": "B01", - "nodata": 0, - "data_type": "uint16", - "bits_per_sample": 15, - "spatial_resolution": 60, - "scale": 0.0001, - "offset": 0, - "unit": "m" - }, - { - "name": "B02", - "nodata": 0, - "data_type": "uint16", - "bits_per_sample": 15, - 
"spatial_resolution": 10, - "scale": 0.0001, - "offset": 0, - "unit": "m" - }, - { - "name": "B03", - "nodata": 0, - "data_type": "uint16", - "bits_per_sample": 15, - "spatial_resolution": 10, - "scale": 0.0001, - "offset": 0, - "unit": "m" - }, - { - "name": "B04", - "nodata": 0, - "data_type": "uint16", - "bits_per_sample": 15, - "spatial_resolution": 10, - "scale": 0.0001, - "offset": 0, - "unit": "m" - }, - { - "name": "B05", - "nodata": 0, - "data_type": "uint16", - "bits_per_sample": 15, - "spatial_resolution": 20, - "scale": 0.0001, - "offset": 0, - "unit": "m" - }, - { - "name": "B06", - "nodata": 0, - "data_type": "uint16", - "bits_per_sample": 15, - "spatial_resolution": 20, - "scale": 0.0001, - "offset": 0, - "unit": "m" - }, - { - "name": "B07", - "nodata": 0, - "data_type": "uint16", - "bits_per_sample": 15, - "spatial_resolution": 20, - "scale": 0.0001, - "offset": 0, - "unit": "m" - }, - { - "name": "B08", - "nodata": 0, - "data_type": "uint16", - "bits_per_sample": 15, - "spatial_resolution": 10, - "scale": 0.0001, - "offset": 0, - "unit": "m" - }, - { - "name": "B8A", - "nodata": 0, - "data_type": "uint16", - "bits_per_sample": 15, - "spatial_resolution": 20, - "scale": 0.0001, - "offset": 0, - "unit": "m" - }, - { - "name": "B09", - "nodata": 0, - "data_type": "uint16", - "bits_per_sample": 15, - "spatial_resolution": 60, - "scale": 0.0001, - "offset": 0, - "unit": "m" - }, - { - "name": "B10", - "nodata": 0, - "data_type": "uint16", - "bits_per_sample": 15, - "spatial_resolution": 60, - "scale": 0.0001, - "offset": 0, - "unit": "m" - }, - { - "name": "B11", - "nodata": 0, - "data_type": "uint16", - "bits_per_sample": 15, - "spatial_resolution": 20, - "scale": 0.0001, - "offset": 0, - "unit": "m" - }, - { - "name": "B12", - "nodata": 0, - "data_type": "uint16", - "bits_per_sample": 15, - "spatial_resolution": 20, - "scale": 0.0001, - "offset": 0, - "unit": "m" - } ] }, "assets": { diff --git a/json-schema/schema.json b/json-schema/schema.json 
index 2f55335..ed39c70 100644 --- a/json-schema/schema.json +++ b/json-schema/schema.json @@ -230,8 +230,8 @@ "enum": [ "PyTorch", "TensorFlow", - "Scikit-learn", - "Huggingface", + "scikit-learn", + "Hugging Face", "Keras", "ONNX", "rgee", @@ -246,6 +246,7 @@ { "type": "string", "minLength": 1, + "pattern": "^(?=[^\\s._\\-]).*[^\\s._\\-]$", "description": "Any other framework name to allow extension. Enum names should be preferred when possible to allow better portability." } ] @@ -632,12 +633,12 @@ }, { "$comment": "However, if any band is indicated, a 'bands'-compliant section should describe them.", - "FIXME_$ref": "#/$defs/AnyBandsRef" + "$ref": "#/$defs/AnyBandsRef" } ] }, "AnyBandsRef": { - "$comment": "This definition ensures that, if at least 1 named MLM 'bands' is provided, at least 1 of the supported references from EO, Raster or STAC Core 1.1 are provided as well.", + "$comment": "This definition ensures that, if at least 1 named MLM input 'bands' is provided, at least 1 of the supported references from EO, Raster or STAC Core 1.1 are provided as well. Otherwise, 'bands' must be explicitly empty.", "if": { "$comment": "This is the JSON-object 'properties' definition.", "properties": { @@ -659,10 +660,11 @@ "properties": { "bands": { "type": "array", + "$comment": "This 'minItems' is the purpose of this whole 'if/then' block.", "minItems": 1, "items": { "type": "string", - "$comment": "This 'minItems' is the purpose of this whole 'if/then' block." 
+ "minLength": 1 } } } @@ -775,15 +777,9 @@ "$comment": "This is the 'Model Input Object' properties.", "properties": { "bands": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "array", - "maxItems": 0 - } - ] + "$comment": "No bands reference provided, therefore none permitted in model inputs.", + "type": "array", + "maxItems": 0 } } } diff --git a/pyproject.toml b/pyproject.toml index 96fe567..6e11424 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -117,6 +117,32 @@ glob = "**/*.py" search = "https://crim-ca.github.io/mlm-extension/v{current_version}/schema.json" replace = "https://crim-ca.github.io/mlm-extension/v{new_version}/schema.json" +[[tool.bumpversion.files]] +filename = "CHANGELOG.md" +search = """ +## [Unreleased](https://github.com/crim-ca/mlm-extension/tree/main) +""" +replace = """ +## [Unreleased](https://github.com/crim-ca/mlm-extension/tree/main) + +### Added +- n/a + +### Changed +- n/a + +### Deprecated +- n/a + +### Removed +- n/a + +### Fixed +- n/a + +## [v{new_version}](https://github.com/crim-ca/mlm-extension/tree/v{new_version}) +""" + [tool.ruff] exclude = [ ".git", diff --git a/tests/conftest.py b/tests/conftest.py index 0092fe0..996b1a6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -30,12 +30,13 @@ def mlm_validator( mlm_schema: Dict[str, Any], ) -> pystac.validation.stac_validator.JsonSchemaSTACValidator: """ - Update the :class:`pystac.validation.RegisteredValidator` with the local ML-AOI JSON schema definition. + Update the :class:`pystac.validation.RegisteredValidator` with the local MLM JSON schema definition. Because the schema is *not yet* uploaded to the expected STAC schema URI, any call to :func:`pystac.validation.validate` or :meth:`pystac.stac_object.STACObject.validate` results - in ``GetSchemaError`` when the schema retrieval is attempted by the validator. By adding the schema to the - mapping beforehand, remote resolution can be bypassed temporarily. 
+ in ``GetSchemaError`` when the schema retrieval is attempted by the validator. By adding the schema to the + mapping beforehand, remote resolution can be bypassed temporarily. When evaluating modifications to the + current schema, this also ensures that local changes are used instead of the remote reference. """ validator = pystac.validation.RegisteredValidator.get_validator() validator = cast(pystac.validation.stac_validator.JsonSchemaSTACValidator, validator) diff --git a/tests/test_schema.py b/tests/test_schema.py index a3e9899..3fb2b6b 100644 --- a/tests/test_schema.py +++ b/tests/test_schema.py @@ -14,6 +14,7 @@ "item_basic.json", "item_raster_bands.json", "item_eo_bands.json", + "item_eo_and_raster_bands.json", "item_multi_io.json", ], indirect=True,