BUG: fix encoding issues on Windows for some formats #361

Merged
Commits (33)
37f5964  ENH: improve handling of encoding on windows (theroggy, Feb 23, 2024)
e6d4169  Add test to check encoding used to write csv (theroggy, Feb 23, 2024)
a5b6891  Fix writing string data in correct encoding (theroggy, Feb 23, 2024)
18e65f7  Encoding detection when writing based on driver (theroggy, Feb 23, 2024)
b6b3f6a  encode string values to the detected encoding (theroggy, Feb 23, 2024)
698ad27  Rollback unneeded change (theroggy, Feb 23, 2024)
28b30d9  replace filecmp with manual check (theroggy, Feb 23, 2024)
b55a4cd  encoding can only be determined after creating the output layer (theroggy, Feb 23, 2024)
c22d6ad  Fix xlsx encoding (theroggy, Feb 24, 2024)
fcfdd31  Always encode field names in UTF8 (theroggy, Feb 24, 2024)
3ced6e7  Add GeoJSONSeq to be utf8 for old gdal versions (theroggy, Feb 24, 2024)
4cc390d  Update CHANGES.md (theroggy, Feb 24, 2024)
4f6fe84  Update CHANGES.md (theroggy, Feb 24, 2024)
70c71c1  Try disabling the XLSX utf8 hardcoding use as it should be redundant (theroggy, Feb 24, 2024)
4814c00  Add logging regarding locale.setpreferredencoding (theroggy, Feb 24, 2024)
9d8680d  Move encoding detection to where the layer has already been further d… (theroggy, Feb 24, 2024)
d0fcd0b  Move detection to after where transation is started (theroggy, Feb 24, 2024)
b0911b9  Rollback changes to debug XLSX issue in gdal (theroggy, Feb 24, 2024)
2e7f138  Add column name with special characters in csv tests (theroggy, Feb 24, 2024)
9723475  Update test_geopandas_io.py (theroggy, Feb 24, 2024)
78a30b4  Specify different encodings in csv tests (theroggy, Feb 26, 2024)
85cbbe8  Merge remote-tracking branch 'upstream/main' into ENH-improve-handlin… (theroggy, Mar 1, 2024)
e7141c5  Set default encoding for shapefile to UTF-8 for all platforms. (theroggy, Mar 1, 2024)
5c3004a  Merge remote-tracking branch 'upstream/main' into ENH-improve-handlin… (theroggy, Mar 5, 2024)
b3d31d8  Improve changelog entry for PR 366 (arrow metadata) (theroggy, Mar 5, 2024)
ea061e0  Add UTF-8 shapefiles to changelog (theroggy, Mar 5, 2024)
9255602  Centralize getprefferedencoding + improve inline doc (theroggy, Mar 5, 2024)
f836bd2  Add check that shp is written in UTF-8 by default on all platforms (theroggy, Mar 5, 2024)
350fe0d  Simplify detect_encoding again + improve doc (theroggy, Mar 5, 2024)
ba9b808  Add encoding test for shp writing (theroggy, Mar 5, 2024)
1cc34d5  Add more documentation (theroggy, Mar 5, 2024)
23ec6f7  Update _io.pyx (theroggy, Mar 5, 2024)
c6a9625  Merge branch 'main' into ENH-improve-handling-of-encoding-on-windows (brendan-ward, Apr 4, 2024)
2 changes: 2 additions & 0 deletions CHANGES.md
@@ -14,6 +14,8 @@

- Fix error in `write_dataframe` if input has a date column and
non-consecutive index values (#325).
- Fix encoding issues on Windows for some formats (e.g. ".csv") and always write ESRI
Shapefiles using UTF-8 by default on all platforms (#361).
- Raise exception in `read_arrow` or `read_dataframe(..., use_arrow=True)` if
a boolean column is detected, due to an error in GDAL reading boolean values (#335);
this has been fixed in GDAL >= 3.8.3.
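For illustration, a minimal sketch of the round trip this entry enables, using pyogrio's `read_dataframe`/`write_dataframe` (file names are hypothetical; with no `encoding` argument, CSV falls back to the system preferred encoding, while ESRI Shapefiles are now written as UTF-8 on every platform):

import pandas as pd
from pyogrio import read_dataframe, write_dataframe

df = pd.DataFrame({"näme": ["Wilhelm Röntgen"], "city": ["Zürich"]})

# Write and read back a CSV with an explicit encoding override.
write_dataframe(df, "cities.csv", encoding="cp1252")
roundtrip = read_dataframe("cities.csv", encoding="cp1252")

# Values come back decoded to Python str, regardless of the file encoding.
assert roundtrip["city"].tolist() == ["Zürich"]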
75 changes: 47 additions & 28 deletions pyogrio/_io.pyx
@@ -461,9 +461,11 @@ cdef get_metadata(GDALMajorObjectH obj):


cdef detect_encoding(OGRDataSourceH ogr_dataset, OGRLayerH ogr_layer):
"""Attempt to detect the encoding of the layer.
If it supports UTF-8, use that.
If it is a shapefile, it must otherwise be ISO-8859-1.
"""Attempt to detect the encoding to use to read/write string values.

If the layer/dataset supports reading/writing data in UTF-8, returns UTF-8.
If UTF-8 is not supported and the driver is ESRI Shapefile, returns ISO-8859-1.
Otherwise, the system locale preferred encoding is returned.

Parameters
----------
@@ -476,18 +478,39 @@ cdef detect_encoding(OGRDataSourceH ogr_dataset, OGRLayerH ogr_layer):
"""

if OGR_L_TestCapability(ogr_layer, OLCStringsAsUTF8):
return 'UTF-8'
# OGR_L_TestCapability returns True for OLCStringsAsUTF8 if GDAL hides encoding
# complexities for this layer/driver type. In this case all string attribute
# values have to be supplied in UTF-8 and values will be returned in UTF-8.
# The encoding used to read/write under the hood depends on the driver used.
# For layers/drivers where False is returned, the string values are written and
# read without recoding. Hence, it is up to you to supply the data in the
# appropriate encoding. More info:
# https://gdal.org/development/rfc/rfc23_ogr_unicode.html#oftstring-oftstringlist-fields
return "UTF-8"

driver = get_driver(ogr_dataset)
if driver == 'ESRI Shapefile':
return 'ISO-8859-1'
if driver == "ESRI Shapefile":
# Typically, OGR_L_TestCapability returns True for OLCStringsAsUTF8 for ESRI
# Shapefile so this won't be reached. However, for old versions of GDAL and/or
# if libraries are missing, this fallback could be needed.
return "ISO-8859-1"

if driver == "OSM":
# always set OSM data to UTF-8
# per https://help.openstreetmap.org/questions/2172/what-encoding-does-openstreetmap-use
return "UTF-8"

return None
if driver in ("XLSX", "ODS"):
# TestCapability for OLCStringsAsUTF8 for XLSX and ODS was False for new files
# being created for GDAL < 3.8.5. Once these versions of GDAL are no longer
# supported, this can be removed.
return "UTF-8"

if driver == "GeoJSONSeq":
# In old GDAL versions, OLCStringsAsUTF8 wasn't advertised yet.
return "UTF-8"

return locale.getpreferredencoding()


cdef get_fields(OGRLayerH ogr_layer, str encoding, use_arrow=False):
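For readers who want to see the cascade end to end, a rough Python equivalent using GDAL's own bindings (an illustrative sketch assuming the `osgeo` package is installed; pyogrio implements this in Cython, not via `osgeo`):

import locale

from osgeo import ogr

def detect_encoding_sketch(path):
    ds = ogr.Open(path)
    layer = ds.GetLayer(0)
    # GDAL guarantees UTF-8 in and out when this capability is advertised.
    if layer.TestCapability(ogr.OLCStringsAsUTF8):
        return "UTF-8"
    driver = ds.GetDriver().GetName()
    if driver == "ESRI Shapefile":
        # Fallback for old GDAL builds or builds with missing libraries.
        return "ISO-8859-1"
    if driver in ("OSM", "XLSX", "ODS", "GeoJSONSeq"):
        return "UTF-8"
    # No better information available: use the locale's preferred encoding.
    return locale.getpreferredencoding()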
@@ -1133,11 +1156,7 @@ def ogr_read(

# Encoding is derived from the user, from the dataset capabilities / type,
# or from the system locale
encoding = (
encoding
or detect_encoding(ogr_dataset, ogr_layer)
or locale.getpreferredencoding()
)
encoding = encoding or detect_encoding(ogr_dataset, ogr_layer)

fields = get_fields(ogr_layer, encoding)

@@ -1323,11 +1342,7 @@ def ogr_open_arrow(

# Encoding is derived from the user, from the dataset capabilities / type,
# or from the system locale
encoding = (
encoding
or detect_encoding(ogr_dataset, ogr_layer)
or locale.getpreferredencoding()
)
encoding = encoding or detect_encoding(ogr_dataset, ogr_layer)

fields = get_fields(ogr_layer, encoding, use_arrow=True)

@@ -1518,11 +1533,7 @@ def ogr_read_info(

# Encoding is derived from the user, from the dataset capabilities / type,
# or from the system locale
encoding = (
encoding
or detect_encoding(ogr_dataset, ogr_layer)
or locale.getpreferredencoding()
)
encoding = encoding or detect_encoding(ogr_dataset, ogr_layer)

fields = get_fields(ogr_layer, encoding)

@@ -1756,6 +1767,12 @@ cdef create_ogr_dataset_layer(
the caller still needs to set up the layer definition, i.e. create the
fields).

Parameters
----------
encoding : str
Only used if `driver` is "ESRI Shapefile". If not None, it overrides the default
shapefile encoding, which is "UTF-8" in pyogrio.

Returns
-------
bool :
@@ -1847,6 +1864,8 @@
if driver == 'ESRI Shapefile':
# Fiona only sets encoding for shapefiles; other drivers do not support
# encoding as an option.
if encoding is None:
encoding = "UTF-8"
encoding_b = encoding.upper().encode('UTF-8')
encoding_c = encoding_b
layer_options = CSLSetNameValue(layer_options, "ENCODING", encoding_c)
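At the Python API level the effect is the following (a sketch based on the raw-level test added in this PR; paths are hypothetical):

from pyogrio.raw import read, write

meta, _, geometry, field_data = read("input.shp")

# Default: pyogrio passes ENCODING=UTF-8 to the driver, so the .cpg
# sidecar contains "UTF-8" on every platform.
meta["encoding"] = None
write("out_utf8.shp", geometry, field_data, **meta)

# Explicit override: the given value is forwarded to the driver instead.
meta["encoding"] = "ISO-8859-1"
write("out_latin1.shp", geometry, field_data, **meta)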
@@ -1962,22 +1981,23 @@ def ogr_write(
gdal_tz_offsets = {}

### Setup up dataset and layer
if not encoding:
encoding = locale.getpreferredencoding()

layer_created = create_ogr_dataset_layer(
path, layer, driver, crs, geometry_type, encoding,
dataset_kwargs, layer_kwargs, append,
dataset_metadata, layer_metadata,
&ogr_dataset, &ogr_layer,
)

# Now that the dataset and layer have been created, we can properly determine the
# encoding. It is derived from the user, from the dataset capabilities / type,
# or from the system locale
encoding = encoding or detect_encoding(ogr_dataset, ogr_layer)

### Create the fields
field_types = None
if num_fields > 0:
field_types = infer_field_types([field.dtype for field in field_data])

### Create the fields
if layer_created:
for i in range(num_fields):
field_type, field_subtype, width, precision = field_types[i]
@@ -2076,7 +2096,6 @@ def ogr_write(
OGR_F_SetFieldNull(ogr_feature, field_idx)

elif field_type == OFTString:
# TODO: encode string using approach from _get_internal_encoding which checks layer capabilities
if (
field_value is None
or (isinstance(field_value, float) and isnan(field_value))
Expand All @@ -2088,7 +2107,7 @@ def ogr_write(
field_value = str(field_value)

try:
value_b = field_value.encode("UTF-8")
value_b = field_value.encode(encoding)
OGR_F_SetFieldString(ogr_feature, field_idx, value_b)

except AttributeError:
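The switch above from a hard-coded `"UTF-8"` to the detected `encoding` is the crux of the fix for drivers that do not advertise OLCStringsAsUTF8. When bytes are written in one encoding but read back in another, the classic failure mode looks like this:

# Mojibake: UTF-8 bytes misread as cp1252, as seen on Windows before this fix.
text = "Zürich"
garbled = text.encode("utf-8").decode("cp1252")
print(garbled)  # ZÃ¼rich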
53 changes: 53 additions & 0 deletions pyogrio/tests/test_geopandas_io.py
@@ -62,6 +62,26 @@ def spatialite_available(path):
return False


@pytest.mark.parametrize("encoding", ["utf-8", "cp1252", None])
def test_read_csv_encoding(tmp_path, encoding):
# Write csv test file. If encoding is None, the file is written in the platform
# default encoding: utf-8 on Linux and macOS, cp1252 on Windows.
csv_path = tmp_path / "test.csv"
with open(csv_path, "w", encoding=encoding) as csv:
csv.write("näme,city\n")
csv.write("Wilhelm Röntgen,Zürich\n")

# Read csv. The data should be read with the same default encoding as the csv file
# was written in, and should be converted to utf-8 in the returned dataframe.
# Hence, the asserts below, which compare against utf-8 strings, should pass.
df = read_dataframe(csv_path, encoding=encoding)

assert len(df) == 1
assert df.columns.tolist() == ["näme", "city"]
assert df.city.tolist() == ["Zürich"]
assert df.näme.tolist() == ["Wilhelm Röntgen"]


def test_read_dataframe(naturalearth_lowres_all_ext):
df = read_dataframe(naturalearth_lowres_all_ext)

@@ -792,6 +812,39 @@ def test_read_sql_dialect_sqlite_gpkg(naturalearth_lowres, use_arrow):
assert df.iloc[0].geometry.area > area_canada


@pytest.mark.parametrize("encoding", ["utf-8", "cp1252", None])
def test_write_csv_encoding(tmp_path, encoding):
"""Test if write_dataframe uses the default encoding correctly."""
# Write csv test file. If encoding is None, the file is written in the platform
# default encoding: utf-8 on Linux and macOS, cp1252 on Windows.
csv_path = tmp_path / "test.csv"

with open(csv_path, "w", encoding=encoding) as csv:
csv.write("näme,city\n")
csv.write("Wilhelm Röntgen,Zürich\n")

# Write csv test file with the same data using write_dataframe. It should use the
# same encoding as above.
df = pd.DataFrame({"näme": ["Wilhelm Röntgen"], "city": ["Zürich"]})
csv_pyogrio_path = tmp_path / "test_pyogrio.csv"
write_dataframe(df, csv_pyogrio_path, encoding=encoding)

# Check that the text files written both ways can be read again and give the same result.
with open(csv_path, "r", encoding=encoding) as csv:
csv_str = csv.read()
with open(csv_pyogrio_path, "r", encoding=encoding) as csv_pyogrio:
csv_pyogrio_str = csv_pyogrio.read()
assert csv_str == csv_pyogrio_str

# Check that the files are binary identical, to be 100% sure they were written
# with the same encoding.
with open(csv_path, "rb") as csv:
csv_bytes = csv.read()
with open(csv_pyogrio_path, "rb") as csv_pyogrio:
csv_pyogrio_bytes = csv_pyogrio.read()
assert csv_bytes == csv_pyogrio_bytes


@pytest.mark.parametrize("ext", ALL_EXTS)
def test_write_dataframe(tmp_path, naturalearth_lowres, ext):
input_gdf = read_dataframe(naturalearth_lowres)
10 changes: 9 additions & 1 deletion pyogrio/tests/test_raw_io.py
@@ -420,16 +420,24 @@ def test_read_return_only_fids(naturalearth_lowres):
assert len(field_data) == 0


def test_write(tmpdir, naturalearth_lowres):
@pytest.mark.parametrize("encoding", [None, "ISO-8859-1"])
def test_write_shp(tmpdir, naturalearth_lowres, encoding):
meta, _, geometry, field_data = read(naturalearth_lowres)

filename = os.path.join(str(tmpdir), "test.shp")
meta["encoding"] = encoding
write(filename, geometry, field_data, **meta)

assert os.path.exists(filename)
for ext in (".dbf", ".prj"):
assert os.path.exists(filename.replace(".shp", ext))

# We write shapefiles in UTF-8 by default on all platforms
expected_encoding = encoding if encoding is not None else "UTF-8"
with open(filename.replace(".shp", ".cpg")) as cpg_file:
result_encoding = cpg_file.read()
assert result_encoding == expected_encoding


def test_write_gpkg(tmpdir, naturalearth_lowres):
meta, _, geometry, field_data = read(naturalearth_lowres)