From dc90ce6a5ad0e1a7edd7c799a6cacf03ee0913d0 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 13 Dec 2020 10:29:57 -0800 Subject: [PATCH 1/3] REF: roll DatetimeBlock.astype into Block._astype --- pandas/core/arrays/datetimes.py | 9 +++++++-- pandas/core/generic.py | 1 + pandas/core/internals/blocks.py | 2 +- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 8c94a1a080dca..359c17d61239a 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -589,10 +589,15 @@ def astype(self, dtype, copy=True): if is_datetime64_ns_dtype(dtype) and not is_dtype_equal(dtype, self.dtype): # GH#18951: datetime64_ns dtype but not equal means different tz + # FIXME: this doesn't match DatetimeBlock.astype, xref GH#33401 new_tz = getattr(dtype, "tz", None) - if getattr(self.dtype, "tz", None) is None: + if self.tz is None: return self.tz_localize(new_tz) - result = self.tz_convert(new_tz) + elif new_tz is None: + result = self.tz_convert("UTC").tz_localize(None) + else: + result = self.tz_convert(new_tz) + if copy: result = result.copy() if new_tz is None: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f9aa5ca9e8ea9..e9228dde9d546 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5852,6 +5852,7 @@ def astype( elif is_extension_array_dtype(dtype) and self.ndim > 1: # GH 18099/22869: columnwise conversion to extension dtype # GH 24704: use iloc to handle duplicate column names + # TODO(EA2D): special case not needed with 2D EAs results = [ self.iloc[:, i].astype(dtype, copy=copy) for i in range(len(self.columns)) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index b9558daf05ad2..1da2b1482f686 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -654,7 +654,7 @@ def _astype(self, dtype: DtypeObj, copy: bool) -> ArrayLike: return Categorical(values, dtype=dtype) - elif is_datetime64tz_dtype(dtype) and is_datetime64_dtype(values.dtype): + if is_datetime64tz_dtype(dtype) and is_datetime64_dtype(values.dtype): # if we are passed a datetime64[ns, tz] if copy: # this should be the only copy From 59cb93e698ea1ee4fb4cc891c8bbef627c05f7df Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 15 Dec 2020 08:46:03 -0800 Subject: [PATCH 2/3] Make Series[dt64].astype(string) match DTA.astype(string), astype_nansafe --- pandas/core/arrays/datetimelike.py | 2 +- pandas/core/dtypes/cast.py | 8 +++++++ pandas/core/internals/blocks.py | 12 ++++++---- pandas/tests/arrays/string_/test_string.py | 7 +++++- pandas/tests/frame/methods/test_astype.py | 28 ++++++++++++++++++++++ 5 files changed, 50 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index be9864731842d..3e21a7a242b46 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -346,7 +346,7 @@ def astype(self, dtype, copy=True): elif is_string_dtype(dtype) and not is_categorical_dtype(dtype): if is_extension_array_dtype(dtype): arr_cls = dtype.construct_array_type() - return arr_cls._from_sequence(self, dtype=dtype) + return arr_cls._from_sequence(self, dtype=dtype, copy=copy) else: return self._format_native_types() elif is_integer_dtype(dtype): diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 63445d0e1598d..7bc864279e0a5 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -980,6 +980,14 @@ def astype_nansafe( elif not isinstance(dtype, np.dtype): raise ValueError("dtype must be np.dtype or ExtensionDtype") + if arr.dtype.kind in ["m", "M"] and ( + issubclass(dtype.type, str) or dtype == object + ): + from pandas.core.construction import ensure_wrapped_if_datetimelike + + arr = ensure_wrapped_if_datetimelike(arr) + return arr.astype(dtype, copy=copy) + if issubclass(dtype.type, str): return lib.ensure_string_array( arr.ravel(), skipna=skipna, convert_na_value=False diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 1da2b1482f686..9339a9197bae8 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -672,14 +672,17 @@ def _astype(self, dtype: DtypeObj, copy: bool) -> ArrayLike: if isinstance(values, ExtensionArray): values = values.astype(dtype, copy=copy) + elif isinstance(dtype, ExtensionDtype): + # same thing we do in astype_nansafe + cls = dtype.construct_array_type() + return cls._from_sequence(values, dtype=dtype, copy=copy) + else: if issubclass(dtype.type, str): if values.dtype.kind in ["m", "M"]: # use native type formatting for datetime/tz/timedelta arr = pd_array(values) - # Note: in the case where dtype is an np.dtype, i.e. not - # StringDtype, this matches arr.astype(dtype), xref GH#36153 - values = arr._format_native_types(na_rep="NaT") + return arr.astype(dtype) elif is_object_dtype(dtype): if values.dtype.kind in ["m", "M"]: @@ -688,8 +691,7 @@ def _astype(self, dtype: DtypeObj, copy: bool) -> ArrayLike: values = arr.astype(object) else: values = values.astype(object) - # We still need to go through astype_nansafe for - # e.g. dtype = Sparse[object, 0] + return values # astype_nansafe works with 1-d only vals1d = values.ravel() diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 7cc032e61e989..5365929213503 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -126,7 +126,12 @@ def test_string_methods(input, method, dtype, request): def test_astype_roundtrip(dtype, request): if dtype == "arrow_string": reason = "ValueError: Could not convert object to NumPy datetime" - mark = pytest.mark.xfail(reason=reason) + mark = pytest.mark.xfail(reason=reason, raises=ValueError) + request.node.add_marker(mark) + else: + mark = pytest.mark.xfail( + reason="GH#36153 casting from StringArray to dt64 fails", raises=ValueError + ) request.node.add_marker(mark) ser = pd.Series(pd.date_range("2000", periods=12)) diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 54559400e3510..3c65551aafd0f 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -611,3 +611,31 @@ def test_astype_tz_object_conversion(self, tz): # do real test: object dtype to a specified tz, different from construction tz. result = result.astype({"tz": "datetime64[ns, Europe/London]"}) tm.assert_frame_equal(result, expected) + + def test_astype_dt64_to_string(self, frame_or_series, tz_naive_fixture, request): + tz = tz_naive_fixture + if tz is None: + mark = pytest.mark.xfail( + reason="GH#36153 uses ndarray formatting instead of DTA formatting" + ) + request.node.add_marker(mark) + + dti = date_range("2016-01-01", periods=3, tz=tz) + dta = dti._data + dta[0] = NaT + + obj = frame_or_series(dta) + result = obj.astype("string") + + # Check that Series/DataFrame.astype matches DatetimeArray.astype + expected = frame_or_series(dta.astype("string")) + tm.assert_equal(result, expected) + + item = result.iloc[0] + if frame_or_series is DataFrame: + item = item.iloc[0] + assert item is pd.NA + + # For non-NA values, we should match what we get for non-EA str + alt = obj.astype(str) + assert np.all(alt.iloc[1:] == result.iloc[1:]) From 63c8fdf93b1c5c1f812392410205ba92e440e298 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 15 Dec 2020 10:07:21 -0800 Subject: [PATCH 3/3] REF: Block._astype call astype_nansafe more --- pandas/core/dtypes/cast.py | 9 ++------- pandas/core/internals/blocks.py | 24 ++---------------------- 2 files changed, 4 insertions(+), 29 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 7bc864279e0a5..1b89b68f80432 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -30,7 +30,6 @@ conversion, iNaT, ints_to_pydatetime, - ints_to_pytimedelta, ) from pandas._libs.tslibs.timezones import tz_compare from pandas._typing import AnyArrayLike, ArrayLike, Dtype, DtypeObj, Scalar @@ -994,9 +993,7 @@ def astype_nansafe( ).reshape(arr.shape) elif is_datetime64_dtype(arr): - if is_object_dtype(dtype): - return ints_to_pydatetime(arr.view(np.int64)) - elif dtype == np.int64: + if dtype == np.int64: if isna(arr).any(): raise ValueError("Cannot convert NaT values to integer") return arr.view(dtype) @@ -1008,9 +1005,7 @@ def astype_nansafe( raise TypeError(f"cannot astype a datetimelike from [{arr.dtype}] to [{dtype}]") elif is_timedelta64_dtype(arr): - if is_object_dtype(dtype): - return ints_to_pytimedelta(arr.view(np.int64)) - elif dtype == np.int64: + if dtype == np.int64: if isna(arr).any(): raise ValueError("Cannot convert NaT values to integer") return arr.view(dtype) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 9339a9197bae8..f1c2fa847ffc2 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -68,7 +68,7 @@ ) from pandas.core.base import PandasObject import pandas.core.common as com -from pandas.core.construction import array as pd_array, extract_array +from pandas.core.construction import extract_array from pandas.core.indexers import ( check_setitem_lengths, is_empty_indexer, @@ -654,7 +654,7 @@ def _astype(self, dtype: DtypeObj, copy: bool) -> ArrayLike: return Categorical(values, dtype=dtype) - if is_datetime64tz_dtype(dtype) and is_datetime64_dtype(values.dtype): + elif is_datetime64tz_dtype(dtype) and is_datetime64_dtype(values.dtype): # if we are passed a datetime64[ns, tz] if copy: # this should be the only copy @@ -672,27 +672,7 @@ def _astype(self, dtype: DtypeObj, copy: bool) -> ArrayLike: if isinstance(values, ExtensionArray): values = values.astype(dtype, copy=copy) - elif isinstance(dtype, ExtensionDtype): - # same thing we do in astype_nansafe - cls = dtype.construct_array_type() - return cls._from_sequence(values, dtype=dtype, copy=copy) - else: - if issubclass(dtype.type, str): - if values.dtype.kind in ["m", "M"]: - # use native type formatting for datetime/tz/timedelta - arr = pd_array(values) - return arr.astype(dtype) - - elif is_object_dtype(dtype): - if values.dtype.kind in ["m", "M"]: - # Wrap in Timedelta/Timestamp - arr = pd_array(values) - values = arr.astype(object) - else: - values = values.astype(object) - return values - # astype_nansafe works with 1-d only vals1d = values.ravel() values = astype_nansafe(vals1d, dtype, copy=True)