Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add promote_to_multitype keyword in write_dataframe #75

Merged
Merged
Show file tree
Hide file tree
Changes from 41 commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
e3f0541
Add support for force_multitype
theroggy Apr 17, 2022
0ad3480
Move to_multitype code to before feature creation
theroggy Apr 18, 2022
307e928
Add test
theroggy Apr 18, 2022
be42177
Rename force_multitype to promote_to_multitype
theroggy Apr 19, 2022
b79673c
Automatic mode for promote_to_multi+improve tests
theroggy Apr 19, 2022
088f3f9
Add to changelog
theroggy Apr 19, 2022
dbcd058
Improve inline documentation
theroggy Apr 19, 2022
f4a0cfd
Remove type annotations
theroggy Apr 19, 2022
716c11d
typo
theroggy Apr 19, 2022
5a123de
Apply feedback on pull request
theroggy Apr 19, 2022
4c392f2
Update pyogrio/_geometry.pyx
theroggy Apr 21, 2022
d710a10
Update pyogrio/_io.pyx
theroggy Apr 21, 2022
2dba836
Update pyogrio/geopandas.py
theroggy Apr 21, 2022
b8470c0
Skip the test using pygeos if not available
theroggy Apr 21, 2022
f4ab989
Sort on values is sufficient: remove sort on geom
theroggy Apr 22, 2022
3bca6a1
Merge remote-tracking branch 'upstream/main' into to_multitype-option…
theroggy Apr 26, 2022
769553b
Add some extensions to ALL_EXTS
theroggy Apr 27, 2022
4a7b9c9
Remove promote_to_multi bool from API
theroggy Apr 27, 2022
92f8d81
Fix tests for file formats that reorder rows
theroggy Apr 27, 2022
f0c4aa1
POC of demote_to_single
theroggy Apr 28, 2022
9d35aca
Merge remote-tracking branch 'upstream/main' into to_multitype-option…
theroggy May 1, 2022
9819ee9
Revert "POC of demote_to_single"
theroggy May 1, 2022
3fa9c50
Change promote functionality to string argument
theroggy May 2, 2022
4781e6e
Remove asserts on GeoDataFrame
theroggy May 11, 2022
8b59c78
Evade having to sort results in tests for .fgb
theroggy May 14, 2022
b13a56e
Make promote_to_multi a separate param again
theroggy May 15, 2022
eaf369b
Small improvements
theroggy May 15, 2022
49d500b
Even smaller changes
theroggy May 15, 2022
9d78d29
Improve documentation
theroggy May 15, 2022
6856c5b
Merge remote-tracking branch 'upstream/main' into to_multitype-option…
theroggy May 15, 2022
e5d1ad3
Fix promotion if only single geometries to multi
theroggy May 15, 2022
aae713b
Improve inline documentation
theroggy May 16, 2022
4a04c32
Textual improvements to documentation
theroggy May 16, 2022
5ba0e27
Small improvements
theroggy May 16, 2022
e88fc71
Textual change in documentation
theroggy May 16, 2022
fe36164
Expand tests on layer_geometry_type
theroggy May 17, 2022
e7ba447
Add some shapefile tests as well
theroggy May 17, 2022
74d7531
Improvements + fixes
theroggy May 17, 2022
b4091d3
Remove obsolete to_multipolygon function
theroggy May 17, 2022
43c4024
Merge remote-tracking branch 'upstream/main' into to_multitype-option…
theroggy May 18, 2022
ef49019
Fix test_read_autodetect_driver
theroggy May 18, 2022
d173e41
Update pyogrio/tests/test_geopandas_io.py
theroggy May 21, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
- generalize check for VSI files from `/vsizip` to `/vsi` (#29)
- add dtype for each field to `read_info` (#30)
- support writing empty GeoDataFrames (#38)
- add keyword to promote mixed singular/multi geometry column to multi geometry type (#56)
- support use of a sql statement in read_dataframe (#70)

### Breaking changes
Expand Down
15 changes: 12 additions & 3 deletions pyogrio/_io.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1118,7 +1118,7 @@ cdef infer_field_types(list dtypes):
# TODO: handle updateable data sources, like GPKG
# TODO: set geometry and field data as memory views?
def ogr_write(str path, str layer, str driver, geometry, field_data, fields,
str crs, str geometry_type, str encoding, **kwargs):
str crs, str geometry_type, str encoding, bint promote_to_multi=False, **kwargs):

cdef const char *path_c = NULL
cdef const char *layer_c = NULL
Expand All @@ -1131,6 +1131,7 @@ def ogr_write(str path, str layer, str driver, geometry, field_data, fields,
cdef OGRLayerH ogr_layer = NULL
cdef OGRFeatureH ogr_feature = NULL
cdef OGRGeometryH ogr_geometry = NULL
cdef OGRGeometryH ogr_geometry_multi = NULL
cdef OGRFeatureDefnH ogr_featuredef = NULL
cdef OGRFieldDefnH ogr_fielddef = NULL
cdef unsigned char *wkb_buffer = NULL
Expand Down Expand Up @@ -1237,7 +1238,7 @@ def ogr_write(str path, str layer, str driver, geometry, field_data, fields,
### Create the layer
try:
ogr_layer = exc_wrap_pointer(
GDALDatasetCreateLayer(ogr_dataset, layer_c, ogr_crs,
GDALDatasetCreateLayer(ogr_dataset, layer_c, ogr_crs,
<OGRwkbGeometryType>geometry_code, options))

except Exception as exc:
Expand All @@ -1254,7 +1255,6 @@ def ogr_write(str path, str layer, str driver, geometry, field_data, fields,
CSLDestroy(<char**>options)
options = NULL


### Create the fields
field_types = infer_field_types([field.dtype for field in field_data])
for i in range(num_fields):
Expand Down Expand Up @@ -1325,6 +1325,15 @@ def ogr_write(str path, str layer, str driver, geometry, field_data, fields,
ogr_geometry = NULL
raise GeometryError(f"Could not create geometry from WKB at index {i}") from None

# Convert to multi type
if promote_to_multi:
if wkbtype in (wkbPoint, wkbPoint25D, wkbPointM, wkbPointZM):
ogr_geometry = OGR_G_ForceToMultiPoint(ogr_geometry)
elif wkbtype in (wkbLineString, wkbLineString25D, wkbLineStringM, wkbLineStringZM):
ogr_geometry = OGR_G_ForceToMultiLineString(ogr_geometry)
elif wkbtype in (wkbPolygon, wkbPolygon25D, wkbPolygonM, wkbPolygonZM):
ogr_geometry = OGR_G_ForceToMultiPolygon(ogr_geometry)

# Set the geometry on the feature
# this assumes ownership of the geometry and its cleanup
err = OGR_F_SetGeometryDirectly(ogr_feature, ogr_geometry)
Expand Down
3 changes: 3 additions & 0 deletions pyogrio/_ogr.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,9 @@ cdef extern from "ogr_api.h":
int OGR_G_Is3D(OGRGeometryH geometry)
void OGR_G_Set3D(OGRGeometryH geometry, int is3D)
int OGR_G_WkbSize(OGRGeometryH geometry)
OGRGeometryH OGR_G_ForceToMultiPoint(OGRGeometryH geometry)
OGRGeometryH OGR_G_ForceToMultiLineString(OGRGeometryH geometry)
OGRGeometryH OGR_G_ForceToMultiPolygon(OGRGeometryH geometry)

int OGR_GT_HasM(OGRwkbGeometryType eType)
int OGR_GT_HasZ(OGRwkbGeometryType eType)
Expand Down
90 changes: 75 additions & 15 deletions pyogrio/geopandas.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from pyogrio.raw import read, write
from pyogrio.raw import DRIVERS_NO_MIXED_SINGLE_MULTI
from pyogrio.raw import detect_driver, read, write


def _stringify_path(path):
Expand Down Expand Up @@ -166,7 +167,14 @@ def read_dataframe(

# TODO: handle index properly
def write_dataframe(
df, path, layer=None, driver=None, encoding=None, geometry_type=None, **kwargs
df,
path,
layer=None,
driver=None,
encoding=None,
layer_geometry_type=None,
promote_to_multi=None,
**kwargs,
):
"""
Write GeoPandas GeoDataFrame to an OGR file format.
Expand All @@ -183,13 +191,27 @@ def write_dataframe(
encoding : str, optional (default: None)
If present, will be used as the encoding for writing string values to
the file.
geometry_type : string, optional (default: None)
The geometry type for the dataset layer that will be written.
By default will be inferred from the data, but this parameter allows you
to override this and specify the geometry type manually. Possible
values: 'Unknown', 'Point', 'LineString', 'Polygon', 'MultiPoint',
'MultiLineString', 'MultiPolygon', 'GeometryCollection'.

layer_geometry_type : string, optional (default: None)
By default, the geometry type of the layer will be inferred from the
data, after applying the promote_to_multi logic. If the data only contains a
single geometry type (after applying the logic of promote_to_multi), this type
is used for the layer. If the data (still) contains mixed geometry types, the
output layer geometry type will be set to "Unknown".

This parameter does not modify the geometry, but it will try to force the layer
type of the output file to this value. Use this parameter with caution because
using a non-default layer geometry type may result in errors when writing the
file, may be ignored by the driver, or may result in invalid files. Possible
values are: "Unknown", "Point", "LineString", "Polygon", "MultiPoint",
"MultiLineString", "MultiPolygon" or "GeometryCollection".
promote_to_multi : bool, optional (default: None)
If True, will convert singular geometry types in the data to their
corresponding multi geometry type for writing. By default, will convert
mixed singular and multi geometry types to multi geometry types for drivers
that do not support mixed singular and multi geometry types. If False, geometry
types will not be promoted, which may result in errors or invalid files when
attempting to write mixed singular and multi geometry types to drivers that do
not support such combinations.
**kwargs
The kwargs passed to OGR.
"""
Expand All @@ -209,6 +231,9 @@ def write_dataframe(
if not isinstance(df, gp.GeoDataFrame):
raise ValueError("'df' must be a GeoDataFrame")

if driver is None:
driver = detect_driver(path)

geometry_columns = df.columns[df.dtypes == "geometry"]
if len(geometry_columns) == 0:
raise ValueError("'df' does not have a geometry column")
Expand All @@ -226,13 +251,47 @@ def write_dataframe(
# TODO: may need to fill in pd.NA, etc
field_data = [df[f].values for f in fields]

if geometry_type is None:
geometry_type = "Unknown"
# Determine layer_geometry_type and/or promote_to_multi
if layer_geometry_type is None or promote_to_multi is None:
tmp_layer_geometry_type = "Unknown"

# If there is data, infer layer geometry type + promote_to_multi
if not df.empty:
# TODO: validate geometry types, not all combinations are valid
geometry_types = geometry.type.unique()
if len(geometry_types) == 1:
geometry_type = geometry_types[0]
tmp_layer_geometry_type = geometry_types[0]
if promote_to_multi and tmp_layer_geometry_type in (
"Point",
"LineString",
"Polygon",
):
tmp_layer_geometry_type = f"Multi{tmp_layer_geometry_type}"
elif len(geometry_types) == 2:
# Check if the types are corresponding multi + single types
if "Polygon" in geometry_types and "MultiPolygon" in geometry_types:
multi_type = "MultiPolygon"
elif (
"LineString" in geometry_types
and "MultiLineString" in geometry_types
):
multi_type = "MultiLineString"
elif "Point" in geometry_types and "MultiPoint" in geometry_types:
multi_type = "MultiPoint"
else:
multi_type = None

# If they are corresponding multi + single types
if multi_type is not None:
if (
promote_to_multi is None
and driver in DRIVERS_NO_MIXED_SINGLE_MULTI
):
promote_to_multi = True
if promote_to_multi:
tmp_layer_geometry_type = multi_type

if layer_geometry_type is None:
layer_geometry_type = tmp_layer_geometry_type

crs = None
if geometry.crs:
Expand All @@ -252,7 +311,8 @@ def write_dataframe(
field_data=field_data,
fields=fields,
crs=crs,
geometry_type=geometry_type,
geometry_type=layer_geometry_type,
encoding=encoding,
**kwargs
promote_to_multi=promote_to_multi,
**kwargs,
)
62 changes: 40 additions & 22 deletions pyogrio/raw.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,27 @@
import os

from pyogrio._env import GDALEnv
from pyogrio.errors import DataSourceError
from pyogrio.util import vsi_path

with GDALEnv():
from pyogrio._io import ogr_read, ogr_read_info, ogr_list_layers, ogr_write
from pyogrio._io import ogr_read, ogr_write
from pyogrio._ogr import buffer_to_virtual_file, remove_virtual_file


DRIVERS = {
".gpkg": "GPKG",
".shp": "ESRI Shapefile",
".json": "GeoJSON",
".fgb": "FlatGeobuf",
".geojson": "GeoJSON",
".geojsons": "GeoJSONSeq",
".geojsonl": "GeoJSONSeq",
".fgb": "FlatGeobuf",
".geojsons": "GeoJSONSeq",
".gpkg": "GPKG",
".json": "GeoJSON",
".shp": "ESRI Shapefile",
}


# Names of OGR drivers that do not support mixing single and multi geometry
# types in one layer; consulted when deciding whether to promote to multi.
DRIVERS_NO_MIXED_SINGLE_MULTI = {"FlatGeobuf", "GPKG"}


Expand Down Expand Up @@ -113,7 +118,7 @@ def read(
if isinstance(path_or_buffer, bytes):
from_buffer = True
ext = ""
is_zipped = path_or_buffer[:4].startswith(b'PK\x03\x04')
is_zipped = path_or_buffer[:4].startswith(b"PK\x03\x04")
if is_zipped:
ext = ".zip"
path = buffer_to_virtual_file(path_or_buffer, ext=ext)
Expand Down Expand Up @@ -146,6 +151,24 @@ def read(
return result


def detect_driver(path):
    """Infer the OGR driver name from the file extension of ``path``.

    Parameters
    ----------
    path : str or path-like
        Path of the file; only its extension is inspected.

    Returns
    -------
    str
        The driver name registered in ``DRIVERS`` for the lower-cased
        extension of ``path``.

    Raises
    ------
    ValueError
        If the extension (possibly empty) has no entry in ``DRIVERS``.
    """
    # os.path.splitext always returns a (root, ext) 2-tuple, so no length
    # check is needed; a path without an extension simply yields ext == "".
    _, ext = os.path.splitext(path)
    driver = DRIVERS.get(ext.lower())
    if driver is None:
        raise ValueError(
            f"Could not infer driver from path: {path}; please specify driver explicitly"
        )

    return driver


def write(
path,
geometry,
Expand All @@ -157,26 +180,20 @@ def write(
geometry_type=None,
crs=None,
encoding=None,
promote_to_multi=None,
**kwargs,
):

if geometry_type is None:
raise ValueError("geometry_type must be provided")

if driver is None:
# try to infer driver from path
parts = os.path.splitext(path)
if len(parts) != 2:
raise ValueError(
f"Could not infer driver from path: {path}; please specify driver explicitly"
)

ext = parts[1].lower()
driver = DRIVERS.get(ext, None)
if driver is None:
raise ValueError(
f"Could not infer driver from path: {path}; please specify driver explicitly"
)
driver = detect_driver(path)

if promote_to_multi is None:
promote_to_multi = (
geometry_type.startswith("Multi")
and driver in DRIVERS_NO_MIXED_SINGLE_MULTI
)

if crs is None:
warnings.warn(
Expand All @@ -195,5 +212,6 @@ def write(
fields=fields,
crs=crs,
encoding=encoding,
promote_to_multi=promote_to_multi,
**kwargs,
)
8 changes: 6 additions & 2 deletions pyogrio/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@


_data_dir = Path(__file__).parent.resolve() / "fixtures"
ALL_EXTS = [".shp", ".gpkg", ".geojson", ".geojsonl"]
ALL_EXTS = [".fgb", ".geojson", ".geojsonl", ".gpkg", ".shp"]


def pytest_report_header(config):
Expand All @@ -28,7 +28,11 @@ def prepare_testfile(testfile_path, dst_dir, ext):
if dst_path.exists():
return dst_path
gdf = pyogrio.read_dataframe(testfile_path)
pyogrio.write_dataframe(gdf, dst_path)
if ext == ".fgb":
# For .fgb, spatial_index=False to avoid the rows being reordered
pyogrio.write_dataframe(gdf, dst_path, spatial_index=False)
else:
pyogrio.write_dataframe(gdf, dst_path)
return dst_path


Expand Down
Loading