BUG: fix encoding issues on Windows for some formats #361

Merged
Commits (33)
37f5964  ENH: improve handling of encoding on windows (theroggy, Feb 23, 2024)
e6d4169  Add test to check encoding used to write csv (theroggy, Feb 23, 2024)
a5b6891  Fix writing string data in correct encoding (theroggy, Feb 23, 2024)
18e65f7  Encoding detection when writing based on driver (theroggy, Feb 23, 2024)
b6b3f6a  encode string values to the detected encoding (theroggy, Feb 23, 2024)
698ad27  Rollback unneeded change (theroggy, Feb 23, 2024)
28b30d9  replace filecmp with manual check (theroggy, Feb 23, 2024)
b55a4cd  encoding can only be determined after creating the output layer (theroggy, Feb 23, 2024)
c22d6ad  Fix xlsx encoding (theroggy, Feb 24, 2024)
fcfdd31  Always encode field names in UTF8 (theroggy, Feb 24, 2024)
3ced6e7  Add GeoJSONSeq to be utf8 for old gdal versions (theroggy, Feb 24, 2024)
4cc390d  Update CHANGES.md (theroggy, Feb 24, 2024)
4f6fe84  Update CHANGES.md (theroggy, Feb 24, 2024)
70c71c1  Try disabling the XLSX utf8 hardcoding use as it should be redundant (theroggy, Feb 24, 2024)
4814c00  Add logging regarding locale.setpreferredencoding (theroggy, Feb 24, 2024)
9d8680d  Move encoding detection to where the layer has already been further d… (theroggy, Feb 24, 2024)
d0fcd0b  Move detection to after where transation is started (theroggy, Feb 24, 2024)
b0911b9  Rollback changes to debug XLSX issue in gdal (theroggy, Feb 24, 2024)
2e7f138  Add column name with special characters in csv tests (theroggy, Feb 24, 2024)
9723475  Update test_geopandas_io.py (theroggy, Feb 24, 2024)
78a30b4  Specify different encodings in csv tests (theroggy, Feb 26, 2024)
85cbbe8  Merge remote-tracking branch 'upstream/main' into ENH-improve-handlin… (theroggy, Mar 1, 2024)
e7141c5  Set default encoding for shapefile to UTF-8 for all platforms. (theroggy, Mar 1, 2024)
5c3004a  Merge remote-tracking branch 'upstream/main' into ENH-improve-handlin… (theroggy, Mar 5, 2024)
b3d31d8  Improve changelog entry for PR 366 (arrow metadata) (theroggy, Mar 5, 2024)
ea061e0  Add UTF-8 shapefiles to changelog (theroggy, Mar 5, 2024)
9255602  Centralize getprefferedencoding + improve inline doc (theroggy, Mar 5, 2024)
f836bd2  Add check that shp is written in UTF-8 by default on all platforms (theroggy, Mar 5, 2024)
350fe0d  Simplify detect_encoding again + improve doc (theroggy, Mar 5, 2024)
ba9b808  Add encoding test for shp writing (theroggy, Mar 5, 2024)
1cc34d5  Add more documentation (theroggy, Mar 5, 2024)
23ec6f7  Update _io.pyx (theroggy, Mar 5, 2024)
c6a9625  Merge branch 'main' into ENH-improve-handling-of-encoding-on-windows (brendan-ward, Apr 4, 2024)
2 changes: 2 additions & 0 deletions CHANGES.md
@@ -14,6 +14,8 @@

- Fix error in `write_dataframe` if input has a date column and
non-consecutive index values (#325).
- Fix encoding issues on Windows for some formats (e.g. ".csv") and always write ESRI
Shapefiles using UTF-8 by default on all platforms (#361).
- Raise exception in `read_arrow` or `read_dataframe(..., use_arrow=True)` if
a boolean column is detected, due to an error in GDAL reading boolean values (#335);
this has been fixed in GDAL >= 3.8.3.
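For illustration, a minimal sketch of the round trip this entry enables, using pyogrio's `read_dataframe`/`write_dataframe` (file names are hypothetical; with no `encoding` argument, CSV falls back to the system preferred encoding, while ESRI Shapefiles are now written as UTF-8 on every platform):

import pandas as pd
from pyogrio import read_dataframe, write_dataframe

df = pd.DataFrame({"näme": ["Wilhelm Röntgen"], "city": ["Zürich"]})

# Write and read back a CSV with an explicit encoding override.
write_dataframe(df, "cities.csv", encoding="cp1252")
roundtrip = read_dataframe("cities.csv", encoding="cp1252")

# Values come back decoded to Python str, regardless of the file encoding.
assert roundtrip["city"].tolist() == ["Zürich"]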
75 changes: 47 additions & 28 deletions pyogrio/_io.pyx
@@ -461,9 +461,11 @@ cdef get_metadata(GDALMajorObjectH obj):


cdef detect_encoding(OGRDataSourceH ogr_dataset, OGRLayerH ogr_layer):
"""Attempt to detect the encoding of the layer.
If it supports UTF-8, use that.
If it is a shapefile, it must otherwise be ISO-8859-1.
"""Attempt to detect the encoding to use to read/write string values.

If the layer/dataset supports reading/writing data in UTF-8, returns UTF-8.
If UTF-8 is not supported and the driver is ESRI Shapefile, returns ISO-8859-1.
Otherwise, the system locale preferred encoding is returned.

Parameters
----------
@@ -476,18 +478,39 @@ cdef detect_encoding(OGRDataSourceH ogr_dataset, OGRLayerH ogr_layer):
"""

if OGR_L_TestCapability(ogr_layer, OLCStringsAsUTF8):
return 'UTF-8'
# OGR_L_TestCapability returns True for OLCStringsAsUTF8 if GDAL hides encoding
# complexities for this layer/driver type. In this case all string attribute
# values have to be supplied in UTF-8 and values will be returned in UTF-8.
# The encoding used to read/write under the hood depends on the driver used.
# For layers/drivers where False is returned, the string values are written and
# read without recoding. Hence, it is up to you to supply the data in the
# appropriate encoding. More info:
# https://gdal.org/development/rfc/rfc23_ogr_unicode.html#oftstring-oftstringlist-fields
return "UTF-8"

driver = get_driver(ogr_dataset)
if driver == 'ESRI Shapefile':
return 'ISO-8859-1'
if driver == "ESRI Shapefile":
# Typically, OGR_L_TestCapability returns True for OLCStringsAsUTF8 for ESRI
# Shapefile so this won't be reached. However, for old versions of GDAL and/or
# if libraries are missing, this fallback could be needed.
return "ISO-8859-1"

if driver == "OSM":
# always set OSM data to UTF-8
# per https://help.openstreetmap.org/questions/2172/what-encoding-does-openstreetmap-use
return "UTF-8"

return None
if driver in ("XLSX", "ODS"):
# TestCapability for OLCStringsAsUTF8 for XLSX and ODS was False for new files
# being created for GDAL < 3.8.5. Once these versions of GDAL are no longer
# supported, this can be removed.
return "UTF-8"

if driver == "GeoJSONSeq":
# In old GDAL versions, OLCStringsAsUTF8 wasn't advertised yet.
return "UTF-8"

return locale.getpreferredencoding()


cdef get_fields(OGRLayerH ogr_layer, str encoding, use_arrow=False):
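For readers who want to see the cascade end to end, a rough Python equivalent using GDAL's own bindings (an illustrative sketch assuming the `osgeo` package is installed; pyogrio implements this in Cython, not via `osgeo`):

import locale

from osgeo import ogr

def detect_encoding_sketch(path):
    ds = ogr.Open(path)
    layer = ds.GetLayer(0)
    # GDAL guarantees UTF-8 in and out when this capability is advertised.
    if layer.TestCapability(ogr.OLCStringsAsUTF8):
        return "UTF-8"
    driver = ds.GetDriver().GetName()
    if driver == "ESRI Shapefile":
        # Fallback for old GDAL builds or builds with missing libraries.
        return "ISO-8859-1"
    if driver in ("OSM", "XLSX", "ODS", "GeoJSONSeq"):
        return "UTF-8"
    # No better information available: use the locale's preferred encoding.
    return locale.getpreferredencoding()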
@@ -1133,11 +1156,7 @@ def ogr_read(

# Encoding is derived from the user, from the dataset capabilities / type,
# or from the system locale
encoding = (
encoding
or detect_encoding(ogr_dataset, ogr_layer)
or locale.getpreferredencoding()
)
encoding = encoding or detect_encoding(ogr_dataset, ogr_layer)

fields = get_fields(ogr_layer, encoding)

@@ -1323,11 +1342,7 @@ def ogr_open_arrow(

# Encoding is derived from the user, from the dataset capabilities / type,
# or from the system locale
encoding = (
encoding
or detect_encoding(ogr_dataset, ogr_layer)
or locale.getpreferredencoding()
)
encoding = encoding or detect_encoding(ogr_dataset, ogr_layer)

fields = get_fields(ogr_layer, encoding, use_arrow=True)

@@ -1518,11 +1533,7 @@ def ogr_read_info(

# Encoding is derived from the user, from the dataset capabilities / type,
# or from the system locale
encoding = (
encoding
or detect_encoding(ogr_dataset, ogr_layer)
or locale.getpreferredencoding()
)
encoding = encoding or detect_encoding(ogr_dataset, ogr_layer)

fields = get_fields(ogr_layer, encoding)

@@ -1756,6 +1767,12 @@ cdef create_ogr_dataset_layer(
the caller still needs to set up the layer definition, i.e. create the
fields).

Parameters
----------
encoding : str
Only used if `driver` is "ESRI Shapefile". If not None, it overrides the default
shapefile encoding, which is "UTF-8" in pyogrio.

Returns
-------
bool :
@@ -1847,6 +1864,8 @@
if driver == 'ESRI Shapefile':
# Fiona only sets encoding for shapefiles; other drivers do not support
# encoding as an option.
if encoding is None:
encoding = "UTF-8"
encoding_b = encoding.upper().encode('UTF-8')
encoding_c = encoding_b
layer_options = CSLSetNameValue(layer_options, "ENCODING", encoding_c)
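At the Python API level the effect is the following (a sketch based on the raw-level test added in this PR; paths are hypothetical):

from pyogrio.raw import read, write

meta, _, geometry, field_data = read("input.shp")

# Default: pyogrio passes ENCODING=UTF-8 to the driver, so the .cpg
# sidecar contains "UTF-8" on every platform.
meta["encoding"] = None
write("out_utf8.shp", geometry, field_data, **meta)

# Explicit override: the given value is forwarded to the driver instead.
meta["encoding"] = "ISO-8859-1"
write("out_latin1.shp", geometry, field_data, **meta)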
@@ -1962,22 +1981,23 @@ def ogr_write(
gdal_tz_offsets = {}

### Setup up dataset and layer
if not encoding:
encoding = locale.getpreferredencoding()

layer_created = create_ogr_dataset_layer(
path, layer, driver, crs, geometry_type, encoding,
dataset_kwargs, layer_kwargs, append,
dataset_metadata, layer_metadata,
&ogr_dataset, &ogr_layer,
)

# Now that the dataset and layer have been created, we can properly determine the
# encoding. It is derived from the user, from the dataset capabilities / type,
# or from the system locale
encoding = encoding or detect_encoding(ogr_dataset, ogr_layer)

### Create the fields
field_types = None
if num_fields > 0:
field_types = infer_field_types([field.dtype for field in field_data])

### Create the fields
if layer_created:
for i in range(num_fields):
field_type, field_subtype, width, precision = field_types[i]
@@ -2076,7 +2096,6 @@ def ogr_write(
OGR_F_SetFieldNull(ogr_feature, field_idx)

elif field_type == OFTString:
# TODO: encode string using approach from _get_internal_encoding which checks layer capabilities
if (
field_value is None
or (isinstance(field_value, float) and isnan(field_value))
Expand All @@ -2088,7 +2107,7 @@ def ogr_write(
field_value = str(field_value)

try:
value_b = field_value.encode("UTF-8")
value_b = field_value.encode(encoding)
OGR_F_SetFieldString(ogr_feature, field_idx, value_b)

except AttributeError:
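The switch above from a hard-coded `"UTF-8"` to the detected `encoding` is the crux of the fix for drivers that do not advertise OLCStringsAsUTF8. When bytes are written in one encoding but read back in another, the classic failure mode looks like this:

# Mojibake: UTF-8 bytes misread as cp1252, as seen on Windows before this fix.
text = "Zürich"
garbled = text.encode("utf-8").decode("cp1252")
print(garbled)  # ZÃ¼rich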
53 changes: 53 additions & 0 deletions pyogrio/tests/test_geopandas_io.py
@@ -62,6 +62,26 @@ def spatialite_available(path):
return False


@pytest.mark.parametrize("encoding", ["utf-8", "cp1252", None])
def test_read_csv_encoding(tmp_path, encoding):
# Write csv test file. If encoding is None, the file is written in the platform
# default encoding: utf-8 on Linux and macOS, cp1252 on Windows.
csv_path = tmp_path / "test.csv"
with open(csv_path, "w", encoding=encoding) as csv:
csv.write("näme,city\n")
csv.write("Wilhelm Röntgen,Zürich\n")

# Read csv. The data should be read with the same default encoding as the csv file
# was written in, and should be converted to utf-8 in the returned dataframe.
# Hence, the asserts below, which compare against utf-8 strings, should pass.
df = read_dataframe(csv_path, encoding=encoding)

assert len(df) == 1
assert df.columns.tolist() == ["näme", "city"]
assert df.city.tolist() == ["Zürich"]
assert df.näme.tolist() == ["Wilhelm Röntgen"]


def test_read_dataframe(naturalearth_lowres_all_ext):
df = read_dataframe(naturalearth_lowres_all_ext)

@@ -792,6 +812,39 @@ def test_read_sql_dialect_sqlite_gpkg(naturalearth_lowres, use_arrow):
assert df.iloc[0].geometry.area > area_canada


@pytest.mark.parametrize("encoding", ["utf-8", "cp1252", None])
def test_write_csv_encoding(tmp_path, encoding):
"""Test if write_dataframe uses the default encoding correctly."""
# Write csv test file. If encoding is None, the file is written in the platform
# default encoding: utf-8 on Linux and macOS, cp1252 on Windows.
csv_path = tmp_path / "test.csv"

with open(csv_path, "w", encoding=encoding) as csv:
csv.write("näme,city\n")
csv.write("Wilhelm Röntgen,Zürich\n")

# Write csv test file with the same data using write_dataframe. It should use the
# same encoding as above.
df = pd.DataFrame({"näme": ["Wilhelm Röntgen"], "city": ["Zürich"]})
csv_pyogrio_path = tmp_path / "test_pyogrio.csv"
write_dataframe(df, csv_pyogrio_path, encoding=encoding)

# Check that the text files written both ways can be read again and give the same result.
with open(csv_path, "r", encoding=encoding) as csv:
csv_str = csv.read()
with open(csv_pyogrio_path, "r", encoding=encoding) as csv_pyogrio:
csv_pyogrio_str = csv_pyogrio.read()
assert csv_str == csv_pyogrio_str

# Check that the files are binary identical, to be 100% sure they were written
# with the same encoding.
with open(csv_path, "rb") as csv:
csv_bytes = csv.read()
with open(csv_pyogrio_path, "rb") as csv_pyogrio:
csv_pyogrio_bytes = csv_pyogrio.read()
assert csv_bytes == csv_pyogrio_bytes


@pytest.mark.parametrize("ext", ALL_EXTS)
def test_write_dataframe(tmp_path, naturalearth_lowres, ext):
input_gdf = read_dataframe(naturalearth_lowres)
10 changes: 9 additions & 1 deletion pyogrio/tests/test_raw_io.py
@@ -420,16 +420,24 @@ def test_read_return_only_fids(naturalearth_lowres):
assert len(field_data) == 0


def test_write(tmpdir, naturalearth_lowres):
@pytest.mark.parametrize("encoding", [None, "ISO-8859-1"])
def test_write_shp(tmpdir, naturalearth_lowres, encoding):
meta, _, geometry, field_data = read(naturalearth_lowres)

filename = os.path.join(str(tmpdir), "test.shp")
meta["encoding"] = encoding
write(filename, geometry, field_data, **meta)

assert os.path.exists(filename)
for ext in (".dbf", ".prj"):
assert os.path.exists(filename.replace(".shp", ext))

# We write shapefiles in UTF-8 by default on all platforms
expected_encoding = encoding if encoding is not None else "UTF-8"
with open(filename.replace(".shp", ".cpg")) as cpg_file:
result_encoding = cpg_file.read()
assert result_encoding == expected_encoding


def test_write_gpkg(tmpdir, naturalearth_lowres):
meta, _, geometry, field_data = read(naturalearth_lowres)