Skip to content

Commit

Permalink
Merge pull request #32 from eea/fix/vpp-collection-scripts
Browse files Browse the repository at this point in the history
Refactor collection scripts
  • Loading branch information
chorng authored May 17, 2024
2 parents 26922c9 + f01a8f7 commit d11641f
Show file tree
Hide file tree
Showing 3 changed files with 124 additions and 74 deletions.
15 changes: 12 additions & 3 deletions create_vpp_collection.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,18 @@
import logging
from glob import glob

from scripts.vpp.collection import create_collection
from scripts.vpp.collection import create_vpp_collection, get_stac_validator
from scripts.vpp.constants import COLLECTION_ID, STAC_DIR, WORKING_DIR

LOGGER = logging.getLogger(__name__)
if __name__ == "__main__":


def main():
logging.basicConfig(filename="create_vpp_collection.log")
create_collection(f"{WORKING_DIR}/{STAC_DIR}/{COLLECTION_ID}/**/VPP*.json")
item_list = glob(f"{WORKING_DIR}/{STAC_DIR}/{COLLECTION_ID}/**/*.json")
validator = get_stac_validator("schema/products/vpp.json")
create_vpp_collection(item_list, validator)


if __name__ == "__main__":
main()
114 changes: 74 additions & 40 deletions scripts/vpp/collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import json
import logging
import os
from glob import glob

import pystac
import pystac.extensions
Expand All @@ -12,6 +11,7 @@
from jsonschema.exceptions import best_match
from pystac.extensions.item_assets import AssetDefinition, ItemAssetsExtension
from pystac.extensions.projection import ProjectionExtension
from pystac.link import Link
from referencing import Registry, Resource

from .constants import (
Expand All @@ -31,6 +31,10 @@
LOGGER = logging.getLogger(__name__)


class CollectionCreationError(Exception):
pass


def get_stac_validator(product_schema: str) -> Draft7Validator:
with open(product_schema, encoding="utf-8") as f:
schema = json.load(f)
Expand All @@ -40,8 +44,8 @@ def get_stac_validator(product_schema: str) -> Draft7Validator:
return Draft7Validator({"$ref": "http://example.com/schema.json"}, registry=registry)


def create_collection(item_list: list[str]) -> pystac.Collection:
collection = pystac.Collection(
def create_core_collection() -> pystac.Collection:
return pystac.Collection(
id=COLLECTION_ID,
description=COLLECTION_DESCRIPTION,
extent=COLLECTION_EXTENT,
Expand All @@ -50,54 +54,84 @@ def create_collection(item_list: list[str]) -> pystac.Collection:
providers=[VPP_HOST_AND_LICENSOR, VPP_PRODUCER_AND_PROCESSOR],
)

# summaries

def add_summaries_to_collection(collection: pystac.Collection, epsg_list: list[int]) -> None:
summaries = ProjectionExtension.summaries(collection, add_if_missing=True)
summaries.epsg = [
32620,
32621,
32622,
32625,
32626,
32627,
32628,
32629,
32630,
32631,
32632,
32633,
32634,
32635,
32636,
32637,
32638,
32738,
32740,
]

# extensions
summaries.epsg = epsg_list


def add_item_assets_to_collection(collection: pystac.Collection, asset_title_map: dict[str, str]) -> None:
item_assets = ItemAssetsExtension.ext(collection, add_if_missing=True)
item_assets.item_assets = {
key: AssetDefinition({"title": TITLE_MAP[key], "media_type": pystac.MediaType.GEOTIFF, "roles": ["data"]})
for key in TITLE_MAP
key: AssetDefinition({"title": asset_title_map[key], "media_type": pystac.MediaType.GEOTIFF, "roles": ["data"]})
for key in asset_title_map
}

# links
collection.links.append(CLMS_LICENSE)

# add items
items = glob(item_list)
for item in items:
def add_links_to_collection(collection: pystac.Collection, link_list: list[Link]) -> None:
for link in link_list:
collection.links.append(link)


def add_items_to_collection(collection: pystac.Collection, item_list: list[str]) -> None:
for item in item_list:
stac_object = pystac.read_file(item)
collection.add_item(stac_object, title=stac_object.id)

collection.set_self_href(os.path.join(WORKING_DIR, f"{STAC_DIR}/{collection.id}/{collection.id}.json"))
catalog = pystac.read_file(f"{WORKING_DIR}/{STAC_DIR}/clms_catalog.json")
collection.set_root(catalog)
collection.set_parent(catalog)
validator = get_stac_validator("schema/products/vpp.json")

def create_collection(item_list: list[str]) -> pystac.Collection:
try:
collection = create_core_collection()

# summaries
epsg_list = [
32620,
32621,
32622,
32625,
32626,
32627,
32628,
32629,
32630,
32631,
32632,
32633,
32634,
32635,
32636,
32637,
32638,
32738,
32740,
]
add_summaries_to_collection(collection, epsg_list)

# extensions
add_item_assets_to_collection(collection, TITLE_MAP)

# links
link_list = [CLMS_LICENSE]
add_links_to_collection(collection, link_list)

# add items
add_items_to_collection(collection, item_list)

# add self, root, and parent links
collection.set_self_href(os.path.join(WORKING_DIR, f"{STAC_DIR}/{collection.id}/{collection.id}.json"))
catalog = pystac.read_file(f"{WORKING_DIR}/{STAC_DIR}/clms_catalog.json")
collection.set_root(catalog)
collection.set_parent(catalog)
except Exception as error:
raise CollectionCreationError(error)
return collection


def create_vpp_collection(item_list: list[str], validator: Draft7Validator) -> None:
try:
collection = create_collection(item_list)
error_msg = best_match(validator.iter_errors(collection.to_dict()))
assert error_msg is None, f"Failed to create {collection.id} collection. Reason: {error_msg}."
collection.save_object()
except AssertionError as error:
except (AssertionError, CollectionCreationError) as error:
LOGGER.error(error)
69 changes: 38 additions & 31 deletions scripts/vpp/item.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,12 @@
import boto3
import pystac
import rasterio as rio
from botocore.exceptions import BotoCoreError
from botocore.paginate import PageIterator
from jsonschema import Draft7Validator
from jsonschema.exceptions import best_match
from pystac.extensions.projection import ProjectionExtension
from rasterio.coords import BoundingBox
from rasterio.crs import CRS
from rasterio.errors import RasterioIOError
from rasterio.warp import transform_bounds
from referencing import Registry, Resource
from shapely.geometry import Polygon, box, mapping
Expand All @@ -38,6 +36,10 @@
LOGGER = logging.getLogger(__name__)


class ItemCreationError(Exception):
pass


def create_product_list(start_year: int, end_year: int) -> list[str]:
product_list = []
for year in range(start_year, end_year + 1):
Expand All @@ -49,7 +51,7 @@ def create_product_list(start_year: int, end_year: int) -> list[str]:
def create_page_iterator(aws_session: boto3.Session, bucket: str, prefix: str) -> PageIterator:
client = aws_session.client("s3")
paginator = client.get_paginator("list_objects_v2")
return paginator.paginate(Bucket=bucket, Prefix=prefix, Delimiter="-", MaxKeys=10)
return paginator.paginate(Bucket=bucket, Prefix=prefix, Delimiter="-")


def read_metadata_from_s3(bucket: str, key: str, aws_session: boto3.Session) -> tuple[BoundingBox, CRS, int, int]:
Expand Down Expand Up @@ -140,33 +142,38 @@ def add_assets_to_item(item: pystac.Item, asset_dict: dict[str, pystac.Asset]) -


def create_item(aws_session: boto3.Session, bucket: str, tile: str) -> pystac.Item:
client = aws_session.client("s3")
parameters = client.list_objects(Bucket=bucket, Prefix=tile, Delimiter=".")["CommonPrefixes"]
asset_keys = [parameter["Prefix"] + "tif" for parameter in parameters]
_, tail = os.path.split(asset_keys[0])
product_id = "_".join((tail[:23], tail[29:31]))
bounds, crs, height, width, created = read_metadata_from_s3(bucket, asset_keys[0], aws_session)
geom_wgs84 = get_geom_wgs84(bounds, crs)
description = get_description(product_id)
start_datetime, end_datetime = get_datetime(product_id)

# core metadata
item = create_core_item(product_id, geom_wgs84, start_datetime, end_datetime, created, description, COLLECTION_ID)

# common metadata
provider_list = [VPP_HOST_AND_LICENSOR, VPP_PRODUCER_AND_PROCESSOR]
add_providers_to_item(item, provider_list)

# extensions
add_projection_extension_to_item(item, crs, bounds, height, width)

# links
link_list = [CLMS_LICENSE, CLMS_CATALOG_LINK, ITEM_PARENT_LINK, COLLECTION_LINK]
add_links_to_item(item, link_list)

# assets
assets = {os.path.split(key)[-1][:-4].lower(): create_asset(key) for key in asset_keys}
add_assets_to_item(item, assets)
try:
client = aws_session.client("s3")
parameters = client.list_objects(Bucket=bucket, Prefix=tile, Delimiter=".")["CommonPrefixes"]
asset_keys = [parameter["Prefix"] + "tif" for parameter in parameters]
_, tail = os.path.split(asset_keys[0])
product_id = "_".join((tail[:23], tail[29:31]))
bounds, crs, height, width, created = read_metadata_from_s3(bucket, asset_keys[0], aws_session)
geom_wgs84 = get_geom_wgs84(bounds, crs)
description = get_description(product_id)
start_datetime, end_datetime = get_datetime(product_id)

# core metadata
item = create_core_item(
product_id, geom_wgs84, start_datetime, end_datetime, created, description, COLLECTION_ID
)

# common metadata
provider_list = [VPP_HOST_AND_LICENSOR, VPP_PRODUCER_AND_PROCESSOR]
add_providers_to_item(item, provider_list)

# extensions
add_projection_extension_to_item(item, crs, bounds, height, width)

# links
link_list = [CLMS_LICENSE, CLMS_CATALOG_LINK, ITEM_PARENT_LINK, COLLECTION_LINK]
add_links_to_item(item, link_list)

# assets
assets = {os.path.split(key)[-1][:-4].lower(): create_asset(key) for key in asset_keys}
add_assets_to_item(item, assets)
except Exception as error:
raise ItemCreationError(error)
return item


Expand All @@ -186,5 +193,5 @@ def create_vpp_item(aws_session: boto3.Session, bucket: str, validator: Draft7Va
error_msg = best_match(validator.iter_errors(item.to_dict()))
assert error_msg is None, f"Failed to create {item.id} item. Reason: {error_msg}."
item.save_object()
except (AssertionError, BotoCoreError, RasterioIOError) as error:
except (AssertionError, ItemCreationError) as error:
LOGGER.error(error)

0 comments on commit d11641f

Please sign in to comment.