Commit 947b394

Merge branch 'main' into typos
mercedes1996 authored Feb 20, 2025
2 parents f423188 + 7595386 commit 947b394
Showing 147 changed files with 809 additions and 14 deletions.
1 change: 1 addition & 0 deletions docs/api/datasets.rst
@@ -220,6 +220,7 @@ BigEarthNet
^^^^^^^^^^^

.. autoclass:: BigEarthNet
+.. autoclass:: BigEarthNetV2

BioMassters
^^^^^^^^^^^
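The new autoclass entry above documents the BigEarthNetV2 dataset added by this commit. As a point of reference, a minimal usage sketch follows; it assumes BigEarthNetV2 mirrors the constructor pattern of the existing BigEarthNet class (root, split, bands), which this docs diff alone does not confirm.

# Hedged sketch only: assumes BigEarthNetV2 mirrors BigEarthNet's constructor
# (root/split/bands); check the class docstring for the actual signature.
from torchgeo.datasets import BigEarthNetV2

ds = BigEarthNetV2(root='data/bigearthnet/v2', split='train', bands='all')
sample = ds[0]  # a dict of tensors for one patch
print(len(ds), sample.keys())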
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -24,7 +24,7 @@ maintainers = [
]
keywords = ["pytorch", "deep learning", "machine learning", "remote sensing", "satellite imagery", "earth observation", "geospatial"]
classifiers = [
-"Development Status :: 3 - Alpha",
+"Development Status :: 4 - Beta",
"Intended Audience :: Science/Research",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
6 changes: 3 additions & 3 deletions requirements/datasets.txt
@@ -1,10 +1,10 @@
# datasets
-h5py==3.12.1
+h5py==3.13.0
laspy==2.5.4
netcdf4==1.7.2
opencv-python==4.11.0.86
pandas[parquet]==2.2.3
pycocotools==2.0.8
-scikit-image==0.25.1
+scikit-image==0.25.2
scipy==1.15.2
-xarray==2024.11.0
+xarray==2025.1.2
2 changes: 1 addition & 1 deletion requirements/required.txt
@@ -5,7 +5,7 @@ setuptools==75.8.0
einops==0.8.1
fiona==1.10.1
kornia==0.8.0
-lightly==1.5.18
+lightly==1.5.19
lightning[pytorch-extra]==2.5.0.post0
matplotlib==3.10.0
numpy==2.2.3
2 changes: 1 addition & 1 deletion tests/conf/bigearthnet_all.yaml
@@ -10,6 +10,6 @@ data:
init_args:
batch_size: 1
dict_kwargs:
-root: 'tests/data/bigearthnet'
+root: 'tests/data/bigearthnet/v1'
bands: 'all'
num_classes: 19
2 changes: 1 addition & 1 deletion tests/conf/bigearthnet_s1.yaml
@@ -10,6 +10,6 @@ data:
init_args:
batch_size: 1
dict_kwargs:
-root: 'tests/data/bigearthnet'
+root: 'tests/data/bigearthnet/v1'
bands: 's1'
num_classes: 19
2 changes: 1 addition & 1 deletion tests/conf/bigearthnet_s2.yaml
@@ -10,6 +10,6 @@ data:
init_args:
batch_size: 1
dict_kwargs:
-root: 'tests/data/bigearthnet'
+root: 'tests/data/bigearthnet/v1'
bands: 's2'
num_classes: 19
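The dict_kwargs block in these test configs is forwarded to the underlying dataset, so pointing root at tests/data/bigearthnet/v1 keeps the existing V1 fixtures working alongside the new v2 directory. A hedged Python equivalent of the updated bigearthnet_all.yaml, assuming BigEarthNetDataModule passes these extra keyword arguments through to the BigEarthNet dataset constructor:

# Hedged sketch: rough Python equivalent of tests/conf/bigearthnet_all.yaml
# after this change. Assumes the datamodule forwards root/bands/num_classes
# to the dataset; treat the exact signature as unverified here.
from torchgeo.datamodules import BigEarthNetDataModule

dm = BigEarthNetDataModule(
    batch_size=1,
    root='tests/data/bigearthnet/v1',  # was 'tests/data/bigearthnet' before this commit
    bands='all',
    num_classes=19,
)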
File renamed without changes.
File renamed without changes.
Binary file added tests/data/bigearthnet/v2/BigEarthNet-S1.tar.gzaa
Binary file added tests/data/bigearthnet/v2/BigEarthNet-S1.tar.gzab
Binary file added tests/data/bigearthnet/v2/Reference_Maps.tar.gzaa
238 changes: 238 additions & 0 deletions tests/data/bigearthnet/v2/data.py
@@ -0,0 +1,238 @@
#!/usr/bin/env python3

# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import hashlib
import os
import shutil
from pathlib import Path

import numpy as np
import pandas as pd
import rasterio

# Constants
IMG_SIZE = 120
ROOT_DIR = '.'
CHUNK_SIZE = 2**12

# Sample patch definitions
SAMPLE_PATCHES = [
    {
        's2_name': 'S2A_MSIL2A_20170613T101031_N9999_R022_T33UUP_26_57',
        's2_base': 'S2A_MSIL2A_20170613T101031_N9999_R022_T33UUP',
        's1_name': 'S1A_IW_GRDH_1SDV_20170613T165043_33UUP_61_39',
        's1_base': 'S1A_IW_GRDH_1SDV_20170613T165043',
        'split': 'train',
        'labels': [
            'Urban fabric',
            'Industrial or commercial units',
            'Complex cultivation patterns',
        ],
    },
    {
        's2_name': 'S2A_MSIL2A_20170614T102021_N9999_R122_T32TQT_45_38',
        's2_base': 'S2A_MSIL2A_20170614T102021_N9999_R122_T32TQT',
        's1_name': 'S1A_IW_GRDH_1SDV_20170614T165154_32TQT_71_84',
        's1_base': 'S1A_IW_GRDH_1SDV_20170614T165154',
        'split': 'train',
        'labels': [
            'Broad-leaved forest',
            'Mixed forest',
            'Transitional woodland, shrub',
        ],
    },
    {
        's2_name': 'S2B_MSIL2A_20170615T102019_N9999_R122_T32TNS_45_23',
        's2_base': 'S2B_MSIL2A_20170615T102019_N9999_R122_T32TNS',
        's1_name': 'S1A_IW_GRDH_1SDV_20170615T170156_32TNS_77_12',
        's1_base': 'S1A_IW_GRDH_1SDV_20170615T170156',
        'split': 'val',
        'labels': ['Arable land', 'Pastures', 'Inland waters'],
    },
    {
        's2_name': 'S2A_MSIL2A_20170618T101021_N9999_R022_T32TQR_89_34',
        's2_base': 'S2A_MSIL2A_20170618T101021_N9999_R022_T32TQR',
        's1_name': 'S1A_IW_GRDH_1SDV_20170618T165722_32TQR_92_45',
        's1_base': 'S1A_IW_GRDH_1SDV_20170618T165722',
        'split': 'test',
        'labels': [
            'Coniferous forest',
            'Natural grassland and sparsely vegetated areas',
        ],
    },
]

LABEL_TO_CLC = {
    'Urban fabric': 111,
    'Industrial or commercial units': 121,
    'Arable land': 211,
    'Permanent crops': 221,
    'Pastures': 231,
    'Complex cultivation patterns': 242,
    'Land principally occupied by agriculture, with significant areas of natural vegetation': 243,
    'Agro-forestry areas': 244,
    'Broad-leaved forest': 311,
    'Coniferous forest': 312,
    'Mixed forest': 313,
    'Natural grassland and sparsely vegetated areas': 321,
    'Moors, heathland and sclerophyllous vegetation': 322,
    'Transitional woodland, shrub': 324,
    'Beaches, dunes, sands': 331,
    'Inland wetlands': 411,
    'Coastal wetlands': 421,
    'Inland waters': 511,
    'Marine waters': 523,
}

S1_BANDS = ['VV', 'VH']
S2_BANDS = [
    'B01',
    'B02',
    'B03',
    'B04',
    'B05',
    'B06',
    'B07',
    'B08',
    'B8A',
    'B09',
    'B11',
    'B12',
]


def create_directory_structure() -> None:
    """Create the base directory structure"""

    for dir_name in ['BigEarthNet-S1', 'BigEarthNet-S2', 'Reference_Maps']:
        if os.path.exists(os.path.join(ROOT_DIR, dir_name)):
            shutil.rmtree(os.path.join(ROOT_DIR, dir_name))
        Path(os.path.join(ROOT_DIR, dir_name)).mkdir(parents=True, exist_ok=True)


def create_dummy_image(
    path: str, shape: tuple[int, int], dtype: str, labels: list[str] | None = None
) -> None:
    """Create a dummy GeoTIFF file"""
    if dtype == 's1':
        data = np.random.randint(-25, 0, shape).astype(np.int16)
    elif dtype == 's2':
        data = np.random.randint(0, 10000, shape).astype(np.int16)
    else:  # reference map
        clc_codes = [LABEL_TO_CLC[label] for label in labels]
        data = np.random.choice(clc_codes, size=shape).astype(np.uint16)

    with rasterio.open(
        path,
        'w',
        driver='GTiff',
        height=shape[0],
        width=shape[1],
        count=1,
        dtype=data.dtype,
        crs='+proj=utm +zone=32 +datum=WGS84 +units=m +no_defs',
        transform=rasterio.transform.from_origin(0, 0, 10, 10),
    ) as dst:
        dst.write(data, 1)


def generate_sample(patch_info: dict) -> None:
    """Generate a complete sample with S1, S2 and reference data"""
    # Create S1 data
    s1_dir = os.path.join(
        ROOT_DIR, 'BigEarthNet-S1', patch_info['s1_base'], patch_info['s1_name']
    )
    os.makedirs(s1_dir, exist_ok=True)

    for band in S1_BANDS:
        path = os.path.join(s1_dir, f'{patch_info["s1_name"]}_{band}.tif')
        create_dummy_image(path, (IMG_SIZE, IMG_SIZE), 's1')

    # Create S2 data
    s2_dir = os.path.join(
        ROOT_DIR, 'BigEarthNet-S2', patch_info['s2_base'], patch_info['s2_name']
    )
    os.makedirs(s2_dir, exist_ok=True)

    for band in S2_BANDS:
        path = os.path.join(s2_dir, f'{patch_info["s2_name"]}_{band}.tif')
        create_dummy_image(path, (IMG_SIZE, IMG_SIZE), 's2')

    # Create reference map
    ref_dir = os.path.join(
        ROOT_DIR, 'Reference_Maps', patch_info['s2_base'], patch_info['s2_name']
    )
    os.makedirs(ref_dir, exist_ok=True)

    path = os.path.join(ref_dir, f'{patch_info["s2_name"]}_reference_map.tif')
    create_dummy_image(
        path, (IMG_SIZE, IMG_SIZE), 'reference', labels=patch_info['labels']
    )


def create_metadata() -> None:
    """Create metadata parquet file"""
    records = []

    for patch in SAMPLE_PATCHES:
        records.append(
            {
                'patch_id': patch['s2_name'],
                's1_name': patch['s1_name'],
                'split': patch['split'],
                'labels': patch['labels'],
            }
        )

    df = pd.DataFrame.from_records(records)
    df.to_parquet(os.path.join(ROOT_DIR, 'metadata.parquet'))


def main() -> None:
    create_directory_structure()

    for patch_info in SAMPLE_PATCHES:
        generate_sample(patch_info)

    create_metadata()

    for directory in ['BigEarthNet-S1', 'BigEarthNet-S2', 'Reference_Maps']:
        shutil.make_archive(directory, 'gztar', '.', directory)
        tar_path = f'{directory}.tar.gz'

        split_paths = []
        if directory.startswith('BigEarthNet-'):
            with open(tar_path, 'rb') as f:
                content = f.read()
            file_size = len(content)
            midpoint = file_size // 2
            splits = [content[:midpoint], content[midpoint:]]
            suffixes = ['aa', 'ab']
            for suf, split_data in zip(suffixes, splits):
                split_name = f'{directory}.tar.gz{suf}'
                with open(split_name, 'wb') as g:
                    g.write(split_data)
                split_paths.append(split_name)

        elif directory == 'Reference_Maps':
            # For Reference_Maps, create only one split with suffix "aa"
            split_name = f'{directory}.tar.gzaa'
            with open(tar_path, 'rb') as f:
                with open(split_name, 'wb') as g:
                    g.write(f.read())
            split_paths.append(split_name)

        os.remove(tar_path)

        for path in split_paths:
            hash_md5 = hashlib.md5()
            with open(path, 'rb') as f:
                for chunk in iter(lambda: f.read(4096), b''):
                    hash_md5.update(chunk)
            print(path, hash_md5.hexdigest())


if __name__ == '__main__':
    main()
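data.py splits each generated archive into .tar.gzaa/.tar.gzab parts and prints their MD5 digests for use as test checksums. A small hedged companion sketch for sanity-checking the fixtures locally: it rejoins the parts and extracts the result (the glob pattern and paths are illustrative, not part of this commit).

# Hedged companion sketch (not part of the commit): rejoin the split parts
# written by data.py and extract them, to verify the fixtures round-trip.
import glob
import tarfile

for stem in ['BigEarthNet-S1', 'BigEarthNet-S2', 'Reference_Maps']:
    parts = sorted(glob.glob(f'{stem}.tar.gz??'))  # .tar.gzaa, .tar.gzab, ...
    if not parts:
        continue
    archive = f'{stem}.tar.gz'
    with open(archive, 'wb') as out:
        for part in parts:  # byte-concatenation restores the original gzip stream
            with open(part, 'rb') as f:
                out.write(f.read())
    with tarfile.open(archive, 'r:gz') as tar:
        tar.extractall('.')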
Binary file added tests/data/bigearthnet/v2/metadata.parquet
