From f01a8f7b3e1a88db40539e50e6a975ea9a3be5b9 Mon Sep 17 00:00:00 2001 From: chorng Date: Tue, 7 May 2024 18:58:21 +0200 Subject: [PATCH] Refactor collection scripts * break create collection into small functions * fix typing of the create collection function * add ItemCreationError and CollectionCreationError to wrap all possible errors for logging purpose --- create_vpp_collection.py | 15 ++++- scripts/vpp/collection.py | 114 +++++++++++++++++++++++++------------- scripts/vpp/item.py | 69 ++++++++++++----------- 3 files changed, 124 insertions(+), 74 deletions(-) diff --git a/create_vpp_collection.py b/create_vpp_collection.py index 28779d3..9b8cd74 100644 --- a/create_vpp_collection.py +++ b/create_vpp_collection.py @@ -1,9 +1,18 @@ import logging +from glob import glob -from scripts.vpp.collection import create_collection +from scripts.vpp.collection import create_vpp_collection, get_stac_validator from scripts.vpp.constants import COLLECTION_ID, STAC_DIR, WORKING_DIR LOGGER = logging.getLogger(__name__) -if __name__ == "__main__": + + +def main(): logging.basicConfig(filename="create_vpp_collection.log") - create_collection(f"{WORKING_DIR}/{STAC_DIR}/{COLLECTION_ID}/**/VPP*.json") + item_list = glob(f"{WORKING_DIR}/{STAC_DIR}/{COLLECTION_ID}/**/*.json") + validator = get_stac_validator("schema/products/vpp.json") + create_vpp_collection(item_list, validator) + + +if __name__ == "__main__": + main() diff --git a/scripts/vpp/collection.py b/scripts/vpp/collection.py index 80640e0..859d776 100644 --- a/scripts/vpp/collection.py +++ b/scripts/vpp/collection.py @@ -3,7 +3,6 @@ import json import logging import os -from glob import glob import pystac import pystac.extensions @@ -12,6 +11,7 @@ from jsonschema.exceptions import best_match from pystac.extensions.item_assets import AssetDefinition, ItemAssetsExtension from pystac.extensions.projection import ProjectionExtension +from pystac.link import Link from referencing import Registry, Resource from .constants import ( @@ -31,6 +31,10 @@ LOGGER = logging.getLogger(__name__) +class CollectionCreationError(Exception): + pass + + def get_stac_validator(product_schema: str) -> Draft7Validator: with open(product_schema, encoding="utf-8") as f: schema = json.load(f) @@ -40,8 +44,8 @@ def get_stac_validator(product_schema: str) -> Draft7Validator: return Draft7Validator({"$ref": "http://example.com/schema.json"}, registry=registry) -def create_collection(item_list: list[str]) -> pystac.Collection: - collection = pystac.Collection( +def create_core_collection() -> pystac.Collection: + return pystac.Collection( id=COLLECTION_ID, description=COLLECTION_DESCRIPTION, extent=COLLECTION_EXTENT, @@ -50,54 +54,84 @@ def create_collection(item_list: list[str]) -> pystac.Collection: providers=[VPP_HOST_AND_LICENSOR, VPP_PRODUCER_AND_PROCESSOR], ) - # summaries + +def add_summaries_to_collection(collection: pystac.Collection, epsg_list: list[int]) -> None: summaries = ProjectionExtension.summaries(collection, add_if_missing=True) - summaries.epsg = [ - 32620, - 32621, - 32622, - 32625, - 32626, - 32627, - 32628, - 32629, - 32630, - 32631, - 32632, - 32633, - 32634, - 32635, - 32636, - 32637, - 32638, - 32738, - 32740, - ] - - # extensions + summaries.epsg = epsg_list + + +def add_item_assets_to_collection(collection: pystac.Collection, asset_title_map: dict[str, str]) -> None: item_assets = ItemAssetsExtension.ext(collection, add_if_missing=True) item_assets.item_assets = { - key: AssetDefinition({"title": TITLE_MAP[key], "media_type": pystac.MediaType.GEOTIFF, "roles": ["data"]}) - for key in TITLE_MAP + key: AssetDefinition({"title": asset_title_map[key], "media_type": pystac.MediaType.GEOTIFF, "roles": ["data"]}) + for key in asset_title_map } - # links - collection.links.append(CLMS_LICENSE) - # add items - items = glob(item_list) - for item in items: +def add_links_to_collection(collection: pystac.Collection, link_list: list[Link]) -> None: + for link in link_list: + collection.links.append(link) + + +def add_items_to_collection(collection: pystac.Collection, item_list: list[str]) -> None: + for item in item_list: stac_object = pystac.read_file(item) collection.add_item(stac_object, title=stac_object.id) - collection.set_self_href(os.path.join(WORKING_DIR, f"{STAC_DIR}/{collection.id}/{collection.id}.json")) - catalog = pystac.read_file(f"{WORKING_DIR}/{STAC_DIR}/clms_catalog.json") - collection.set_root(catalog) - collection.set_parent(catalog) - validator = get_stac_validator("schema/products/vpp.json") + +def create_collection(item_list: list[str]) -> pystac.Collection: + try: + collection = create_core_collection() + + # summaries + epsg_list = [ + 32620, + 32621, + 32622, + 32625, + 32626, + 32627, + 32628, + 32629, + 32630, + 32631, + 32632, + 32633, + 32634, + 32635, + 32636, + 32637, + 32638, + 32738, + 32740, + ] + add_summaries_to_collection(collection, epsg_list) + + # extensions + add_item_assets_to_collection(collection, TITLE_MAP) + + # links + link_list = [CLMS_LICENSE] + add_links_to_collection(collection, link_list) + + # add items + add_items_to_collection(collection, item_list) + + # add self, root, and parent links + collection.set_self_href(os.path.join(WORKING_DIR, f"{STAC_DIR}/{collection.id}/{collection.id}.json")) + catalog = pystac.read_file(f"{WORKING_DIR}/{STAC_DIR}/clms_catalog.json") + collection.set_root(catalog) + collection.set_parent(catalog) + except Exception as error: + raise CollectionCreationError(error) + return collection + + +def create_vpp_collection(item_list: list[str], validator: Draft7Validator) -> None: try: + collection = create_collection(item_list) error_msg = best_match(validator.iter_errors(collection.to_dict())) assert error_msg is None, f"Failed to create {collection.id} collection. Reason: {error_msg}." collection.save_object() - except AssertionError as error: + except (AssertionError, CollectionCreationError) as error: LOGGER.error(error) diff --git a/scripts/vpp/item.py b/scripts/vpp/item.py index 04604f8..140620d 100644 --- a/scripts/vpp/item.py +++ b/scripts/vpp/item.py @@ -9,14 +9,12 @@ import boto3 import pystac import rasterio as rio -from botocore.exceptions import BotoCoreError from botocore.paginate import PageIterator from jsonschema import Draft7Validator from jsonschema.exceptions import best_match from pystac.extensions.projection import ProjectionExtension from rasterio.coords import BoundingBox from rasterio.crs import CRS -from rasterio.errors import RasterioIOError from rasterio.warp import transform_bounds from referencing import Registry, Resource from shapely.geometry import Polygon, box, mapping @@ -38,6 +36,10 @@ LOGGER = logging.getLogger(__name__) +class ItemCreationError(Exception): + pass + + def create_product_list(start_year: int, end_year: int) -> list[str]: product_list = [] for year in range(start_year, end_year + 1): @@ -49,7 +51,7 @@ def create_product_list(start_year: int, end_year: int) -> list[str]: def create_page_iterator(aws_session: boto3.Session, bucket: str, prefix: str) -> PageIterator: client = aws_session.client("s3") paginator = client.get_paginator("list_objects_v2") - return paginator.paginate(Bucket=bucket, Prefix=prefix, Delimiter="-", MaxKeys=10) + return paginator.paginate(Bucket=bucket, Prefix=prefix, Delimiter="-") def read_metadata_from_s3(bucket: str, key: str, aws_session: boto3.Session) -> tuple[BoundingBox, CRS, int, int]: @@ -140,33 +142,38 @@ def add_assets_to_item(item: pystac.Item, asset_dict: dict[str, pystac.Asset]) - def create_item(aws_session: boto3.Session, bucket: str, tile: str) -> pystac.Item: - client = aws_session.client("s3") - parameters = client.list_objects(Bucket=bucket, Prefix=tile, Delimiter=".")["CommonPrefixes"] - asset_keys = [parameter["Prefix"] + "tif" for parameter in parameters] - _, tail = os.path.split(asset_keys[0]) - product_id = "_".join((tail[:23], tail[29:31])) - bounds, crs, height, width, created = read_metadata_from_s3(bucket, asset_keys[0], aws_session) - geom_wgs84 = get_geom_wgs84(bounds, crs) - description = get_description(product_id) - start_datetime, end_datetime = get_datetime(product_id) - - # core metadata - item = create_core_item(product_id, geom_wgs84, start_datetime, end_datetime, created, description, COLLECTION_ID) - - # common metadata - provider_list = [VPP_HOST_AND_LICENSOR, VPP_PRODUCER_AND_PROCESSOR] - add_providers_to_item(item, provider_list) - - # extensions - add_projection_extension_to_item(item, crs, bounds, height, width) - - # links - link_list = [CLMS_LICENSE, CLMS_CATALOG_LINK, ITEM_PARENT_LINK, COLLECTION_LINK] - add_links_to_item(item, link_list) - - # assets - assets = {os.path.split(key)[-1][:-4].lower(): create_asset(key) for key in asset_keys} - add_assets_to_item(item, assets) + try: + client = aws_session.client("s3") + parameters = client.list_objects(Bucket=bucket, Prefix=tile, Delimiter=".")["CommonPrefixes"] + asset_keys = [parameter["Prefix"] + "tif" for parameter in parameters] + _, tail = os.path.split(asset_keys[0]) + product_id = "_".join((tail[:23], tail[29:31])) + bounds, crs, height, width, created = read_metadata_from_s3(bucket, asset_keys[0], aws_session) + geom_wgs84 = get_geom_wgs84(bounds, crs) + description = get_description(product_id) + start_datetime, end_datetime = get_datetime(product_id) + + # core metadata + item = create_core_item( + product_id, geom_wgs84, start_datetime, end_datetime, created, description, COLLECTION_ID + ) + + # common metadata + provider_list = [VPP_HOST_AND_LICENSOR, VPP_PRODUCER_AND_PROCESSOR] + add_providers_to_item(item, provider_list) + + # extensions + add_projection_extension_to_item(item, crs, bounds, height, width) + + # links + link_list = [CLMS_LICENSE, CLMS_CATALOG_LINK, ITEM_PARENT_LINK, COLLECTION_LINK] + add_links_to_item(item, link_list) + + # assets + assets = {os.path.split(key)[-1][:-4].lower(): create_asset(key) for key in asset_keys} + add_assets_to_item(item, assets) + except Exception as error: + raise ItemCreationError(error) return item @@ -186,5 +193,5 @@ def create_vpp_item(aws_session: boto3.Session, bucket: str, validator: Draft7Va error_msg = best_match(validator.iter_errors(item.to_dict())) assert error_msg is None, f"Failed to create {item.id} item. Reason: {error_msg}." item.save_object() - except (AssertionError, BotoCoreError, RasterioIOError) as error: + except (AssertionError, ItemCreationError) as error: LOGGER.error(error)