Support pandas copy-on-write behaviour #8846

Merged · 6 commits · Mar 18, 2024
9 changes: 8 additions & 1 deletion xarray/core/variable.py
@@ -209,7 +209,14 @@ def _possibly_convert_objects(values):
     as_series = pd.Series(values.ravel(), copy=False)
     if as_series.dtype.kind in "mM":
         as_series = _as_nanosecond_precision(as_series)
-    return np.asarray(as_series).reshape(values.shape)
+    result = np.asarray(as_series).reshape(values.shape)
+    if not result.flags.writeable:
+        # GH8843, pandas copy-on-write mode creates read-only arrays by default
+        try:
+            result.flags.writeable = True
+        except ValueError:
+            result = result.copy()
+    return result
 
 
 def _possibly_convert_datetime_or_timedelta_index(data):
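For context on this hunk: under pandas copy-on-write (pandas >= 2.0), round-tripping values through a Series hands back a read-only NumPy array. A minimal reproduction sketch, not part of the diff, mirroring the try/except fallback above:

import numpy as np
import pandas as pd

pd.set_option("mode.copy_on_write", True)

values = np.array(["2019-01-01", "2019-01-02"], dtype="datetime64[ns]")
as_series = pd.Series(values.ravel(), copy=False)
result = np.asarray(as_series).reshape(values.shape)
print(result.flags.writeable)  # False: pandas protects its buffer under CoW

# Same recovery as the hunk: re-enabling the flag succeeds when the
# underlying memory is actually writeable; otherwise fall back to a copy.
if not result.flags.writeable:
    try:
        result.flags.writeable = True
    except ValueError:
        result = result.copy()

assert result.flags.writeable
pd.reset_option("mode.copy_on_write")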
12 changes: 11 additions & 1 deletion xarray/tests/__init__.py
@@ -20,6 +20,7 @@
 from xarray.core.duck_array_ops import allclose_or_equiv  # noqa: F401
 from xarray.core.indexing import ExplicitlyIndexed
 from xarray.core.options import set_options
+from xarray.core.variable import IndexVariable
 from xarray.testing import (  # noqa: F401
     assert_chunks_equal,
     assert_duckarray_allclose,
@@ -47,6 +48,15 @@
 )
 
 
+def assert_writeable(ds):
+    readonly = [
+        name
+        for name, var in ds.variables.items()
+        if not isinstance(var, IndexVariable) and not var.data.flags.writeable
+    ]
+    assert not readonly, readonly
+
+
 def _importorskip(
     modname: str, minversion: str | None = None
 ) -> tuple[bool, pytest.MarkDecorator]:
@@ -326,7 +336,7 @@ def create_test_data(
     numbers_values = np.random.randint(0, 3, _dims["dim3"], dtype="int64")
     obj.coords["numbers"] = ("dim3", numbers_values)
     obj.encoding = {"foo": "bar"}
-    assert all(obj.data.flags.writeable for obj in obj.variables.values())
+    assert_writeable(obj)
     return obj
 
 
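A usage sketch for the new helper (hypothetical snippet, not part of the diff; assumes a development install where xarray.tests is importable). Unlike the previous all(...) assertion, a failure names the offending variables, and IndexVariable objects are skipped because their pandas-backed buffers are legitimately read-only:

import numpy as np
import xarray as xr
from xarray.tests import assert_writeable

ds = xr.Dataset({"foo": ("x", np.arange(3.0))}, coords={"x": [0, 1, 2]})
assert_writeable(ds)  # passes; "x" is an IndexVariable and is ignored

# Flip "foo" to read-only to see the failure mode: the AssertionError
# now carries the offending names instead of a bare False.
ds["foo"].data.flags.writeable = False
try:
    assert_writeable(ds)
except AssertionError as err:
    print(err)  # ['foo']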
8 changes: 6 additions & 2 deletions xarray/tests/test_backends.py
@@ -2605,7 +2605,9 @@
         # overwrite a coordinate;
         # for mode='a-', this will not get written to the store
         # because it does not have the append_dim as a dim
-        ds_to_append.lon.data[:] = -999
+        lon = ds_to_append.lon.to_numpy().copy()
+        lon[:] = -999
+        ds_to_append["lon"] = lon
         ds_to_append.to_zarr(
             store_target, mode="a-", append_dim="time", **self.version_kwargs
         )
@@ -2615,7 +2617,9 @@
         # by default, mode="a" will overwrite all coordinates.
         ds_to_append.to_zarr(store_target, append_dim="time", **self.version_kwargs)
         actual = xr.open_dataset(store_target, engine="zarr", **self.version_kwargs)
-        original2.lon.data[:] = -999
+        lon = original2.lon.to_numpy().copy()
+        lon[:] = -999
+        original2["lon"] = lon
         assert_identical(original2, actual)
 
     @requires_dask
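Both hunks above follow the same pattern: instead of writing through .data into a buffer that pandas copy-on-write may have handed over read-only, copy the values out, edit the copy, and assign it back as a fresh variable. A standalone sketch with hypothetical data, not from the diff:

import numpy as np
import xarray as xr

ds = xr.Dataset(coords={"lon": [0.0, 1.0, 2.0]})

# Old pattern: in-place write; raises ValueError when the backing
# array is read-only.
#     ds.lon.data[:] = -999

# New pattern: to_numpy() + copy() gives an array we own, and assigning
# it back replaces the variable instead of mutating shared memory.
lon = ds.lon.to_numpy().copy()
lon[:] = -999
ds["lon"] = lon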
GitHub Actions check annotations on this file (macos-latest py3.11), each "Failed: Timeout >180.0s":

- line 4520: TestDask.test_deterministic_names
- line 4538: TestDask.test_dataarray_compute
- line 4551: TestDask.test_save_mfdataset_compute_false_roundtrip
- line 4560: TestDask.test_load_dataset
- line 4568: TestDask.test_load_dataarray
- line 4581: TestDask.test_inline_array
- line 4634: TestPydap.test_cmp_local_file
- line 4668: TestPydap.test_compatible_to_netcdf
- line 4677: TestPydap.test_dask
- line 4822: TestValidateAttrs.test_validating_attrs
57 changes: 21 additions & 36 deletions xarray/tests/test_dataset.py
@@ -51,6 +51,7 @@
     assert_equal,
     assert_identical,
     assert_no_warnings,
+    assert_writeable,
     create_test_data,
     has_cftime,
     has_dask,
@@ -96,11 +97,11 @@ def create_append_test_data(seed=None) -> tuple[Dataset, Dataset, Dataset]:
     nt2 = 2
     time1 = pd.date_range("2000-01-01", periods=nt1)
     time2 = pd.date_range("2000-02-01", periods=nt2)
-    string_var = np.array(["ae", "bc", "df"], dtype=object)
+    string_var = np.array(["a", "bc", "def"], dtype=object)
     string_var_to_append = np.array(["asdf", "asdfg"], dtype=object)
     string_var_fixed_length = np.array(["aa", "bb", "cc"], dtype="|S2")
     string_var_fixed_length_to_append = np.array(["dd", "ee"], dtype="|S2")
-    unicode_var = ["áó", "áó", "áó"]
+    unicode_var = np.array(["áó", "áó", "áó"])
     datetime_var = np.array(
         ["2019-01-01", "2019-01-02", "2019-01-03"], dtype="datetime64[s]"
     )
@@ -119,17 +120,11 @@ def create_append_test_data(seed=None) -> tuple[Dataset, Dataset, Dataset]:
             coords=[lat, lon, time1],
             dims=["lat", "lon", "time"],
         ),
-        "string_var": xr.DataArray(string_var, coords=[time1], dims=["time"]),
-        "string_var_fixed_length": xr.DataArray(
-            string_var_fixed_length, coords=[time1], dims=["time"]
-        ),
-        "unicode_var": xr.DataArray(
-            unicode_var, coords=[time1], dims=["time"]
-        ).astype(np.str_),
-        "datetime_var": xr.DataArray(
-            datetime_var, coords=[time1], dims=["time"]
-        ),
-        "bool_var": xr.DataArray(bool_var, coords=[time1], dims=["time"]),
+        "string_var": ("time", string_var),
+        "string_var_fixed_length": ("time", string_var_fixed_length),
+        "unicode_var": ("time", unicode_var),
+        "datetime_var": ("time", datetime_var),
+        "bool_var": ("time", bool_var),
     }
 )

@@ -140,21 +135,11 @@ def create_append_test_data(seed=None) -> tuple[Dataset, Dataset, Dataset]:
             coords=[lat, lon, time2],
             dims=["lat", "lon", "time"],
         ),
-        "string_var": xr.DataArray(
-            string_var_to_append, coords=[time2], dims=["time"]
-        ),
-        "string_var_fixed_length": xr.DataArray(
-            string_var_fixed_length_to_append, coords=[time2], dims=["time"]
-        ),
-        "unicode_var": xr.DataArray(
-            unicode_var[:nt2], coords=[time2], dims=["time"]
-        ).astype(np.str_),
-        "datetime_var": xr.DataArray(
-            datetime_var_to_append, coords=[time2], dims=["time"]
-        ),
-        "bool_var": xr.DataArray(
-            bool_var_to_append, coords=[time2], dims=["time"]
-        ),
+        "string_var": ("time", string_var_to_append),
+        "string_var_fixed_length": ("time", string_var_fixed_length_to_append),
+        "unicode_var": ("time", unicode_var[:nt2]),
+        "datetime_var": ("time", datetime_var_to_append),
+        "bool_var": ("time", bool_var_to_append),
     }
 )

@@ -168,8 +153,9 @@ def create_append_test_data(seed=None) -> tuple[Dataset, Dataset, Dataset]:
         }
     )
 
-    assert all(objp.data.flags.writeable for objp in ds.variables.values())
-    assert all(objp.data.flags.writeable for objp in ds_to_append.variables.values())
+    assert_writeable(ds)
+    assert_writeable(ds_to_append)
+    assert_writeable(ds_with_new_var)
     return ds, ds_to_append, ds_with_new_var
 
 
@@ -182,10 +168,8 @@ def make_datasets(data, data_to_append) -> tuple[Dataset, Dataset]:
     ds_to_append = xr.Dataset(
         {"temperature": (["time"], data_to_append)}, coords={"time": [0, 1, 2]}
     )
-    assert all(objp.data.flags.writeable for objp in ds.variables.values())
-    assert all(
-        objp.data.flags.writeable for objp in ds_to_append.variables.values()
-    )
+    assert_writeable(ds)
+    assert_writeable(ds_to_append)
     return ds, ds_to_append
 
 u2_strings = ["ab", "cd", "ef"]
@@ -2964,10 +2948,11 @@ def test_copy_coords(self, deep, expected_orig) -> None:
             name="value",
         ).to_dataset()
         ds_cp = ds.copy(deep=deep)
-        ds_cp.coords["a"].data[0] = 999
+        new_a = np.array([999, 2])
+        ds_cp.coords["a"] = ds_cp.a.copy(data=new_a)
 
         expected_cp = xr.DataArray(
-            xr.IndexVariable("a", np.array([999, 2])),
+            xr.IndexVariable("a", new_a),
             coords={"a": [999, 2]},
             dims=["a"],
         )
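Why this test changed, as a sketch (assuming pandas copy-on-write is enabled; data here is hypothetical): a dimension coordinate is backed by a pandas Index, so element-wise assignment into its buffer can now fail, and the test replaces the coordinate wholesale instead:

import numpy as np
import pandas as pd
import xarray as xr

pd.set_option("mode.copy_on_write", True)

ds_cp = xr.Dataset(coords={"a": [1, 2]}).copy(deep=True)

# In-place assignment into the pandas-backed index buffer may now raise
# "ValueError: assignment destination is read-only":
#     ds_cp.coords["a"].data[0] = 999

# Replacing the coordinate wholesale sidesteps the read-only buffer:
ds_cp.coords["a"] = ds_cp.a.copy(data=np.array([999, 2]))

pd.reset_option("mode.copy_on_write")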
9 changes: 6 additions & 3 deletions xarray/tests/test_missing.py
@@ -122,10 +122,13 @@ def test_interpolate_pd_compat(method, fill_value) -> None:
         # for the numpy linear methods.
         # see https://github.com/pandas-dev/pandas/issues/55144
         # This aligns the pandas output with the xarray output
-        expected.values[pd.isnull(actual.values)] = np.nan
-        expected.values[actual.values == fill_value] = fill_value
+        fixed = expected.values.copy()
+        fixed[pd.isnull(actual.values)] = np.nan
+        fixed[actual.values == fill_value] = fill_value
+    else:
+        fixed = expected.values
 
-    np.testing.assert_allclose(actual.values, expected.values)
+    np.testing.assert_allclose(actual.values, fixed)
 
 
 @requires_scipy
15 changes: 15 additions & 0 deletions xarray/tests/test_variable.py
@@ -64,6 +64,21 @@ def var():
     return Variable(dims=list("xyz"), data=np.random.rand(3, 4, 5))
 
 
+@pytest.mark.parametrize(
+    "data",
+    [
+        np.array(["a", "bc", "def"], dtype=object),
+        np.array(["2019-01-01", "2019-01-02", "2019-01-03"], dtype="datetime64[ns]"),
+    ],
+)
+def test_as_compatible_data_writeable(data):
+    pd.set_option("mode.copy_on_write", True)
+    # GH8843, ensure writeable arrays for data_vars even with
+    # pandas copy-on-write mode
+    assert as_compatible_data(data).flags.writeable
+    pd.reset_option("mode.copy_on_write")
+
+
 class VariableSubclassobjects(NamedArraySubclassobjects, ABC):
     @pytest.fixture
     def target(self, data):
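A possible refinement, not part of the diff: pd.option_context would restore the previous copy-on-write setting even when the assertion fails, whereas the set_option/reset_option pair above can leak the mode into later tests if the assert raises. A sketch (test name here is hypothetical):

import numpy as np
import pandas as pd
from xarray.core.variable import as_compatible_data

def test_as_compatible_data_writeable_cow():
    data = np.array(["a", "bc", "def"], dtype=object)
    # option_context resets "mode.copy_on_write" on exit, even on failure
    with pd.option_context("mode.copy_on_write", True):
        assert as_compatible_data(data).flags.writeable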