REF: Block._astype defer to astype_nansafe in more cases (#38562)

pandas-dev · Dec 21, 2020 · 9a46a4b
1 parent d3c52e4
commit 9a46a4b
Show file tree

Hide file tree

Showing 7 changed files with 54 additions and 30 deletions.
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
@@ -346,7 +346,7 @@ def astype(self, dtype, copy=True):
         elif is_string_dtype(dtype) and not is_categorical_dtype(dtype):
             if is_extension_array_dtype(dtype):
                 arr_cls = dtype.construct_array_type()
-                return arr_cls._from_sequence(self, dtype=dtype)
+                return arr_cls._from_sequence(self, dtype=dtype, copy=copy)
             else:
                 return self._format_native_types()
         elif is_integer_dtype(dtype):

diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
@@ -589,10 +589,15 @@ def astype(self, dtype, copy=True):
 
         if is_datetime64_ns_dtype(dtype) and not is_dtype_equal(dtype, self.dtype):
             # GH#18951: datetime64_ns dtype but not equal means different tz
+            # FIXME: this doesn't match DatetimeBlock.astype, xref GH#33401
             new_tz = getattr(dtype, "tz", None)
-            if getattr(self.dtype, "tz", None) is None:
+            if self.tz is None:
                 return self.tz_localize(new_tz)
-            result = self.tz_convert(new_tz)
+            elif new_tz is None:
+                result = self.tz_convert("UTC").tz_localize(None)
+            else:
+                result = self.tz_convert(new_tz)
+
             if copy:
                 result = result.copy()
             if new_tz is None:

diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
@@ -30,7 +30,6 @@
     conversion,
     iNaT,
     ints_to_pydatetime,
-    ints_to_pytimedelta,
 )
 from pandas._libs.tslibs.timezones import tz_compare
 from pandas._typing import AnyArrayLike, ArrayLike, Dtype, DtypeObj, Scalar
@@ -987,15 +986,21 @@ def astype_nansafe(
     elif not isinstance(dtype, np.dtype):
         raise ValueError("dtype must be np.dtype or ExtensionDtype")
 
+    if arr.dtype.kind in ["m", "M"] and (
+        issubclass(dtype.type, str) or dtype == object
+    ):
+        from pandas.core.construction import ensure_wrapped_if_datetimelike
+
+        arr = ensure_wrapped_if_datetimelike(arr)
+        return arr.astype(dtype, copy=copy)
+
     if issubclass(dtype.type, str):
         return lib.ensure_string_array(
             arr.ravel(), skipna=skipna, convert_na_value=False
         ).reshape(arr.shape)
 
     elif is_datetime64_dtype(arr):
-        if is_object_dtype(dtype):
-            return ints_to_pydatetime(arr.view(np.int64))
-        elif dtype == np.int64:
+        if dtype == np.int64:
             if isna(arr).any():
                 raise ValueError("Cannot convert NaT values to integer")
             return arr.view(dtype)
@@ -1007,9 +1012,7 @@ def astype_nansafe(
         raise TypeError(f"cannot astype a datetimelike from [{arr.dtype}] to [{dtype}]")
 
     elif is_timedelta64_dtype(arr):
-        if is_object_dtype(dtype):
-            return ints_to_pytimedelta(arr.view(np.int64))
-        elif dtype == np.int64:
+        if dtype == np.int64:
             if isna(arr).any():
                 raise ValueError("Cannot convert NaT values to integer")
             return arr.view(dtype)

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -5857,6 +5857,7 @@ def astype(
         elif is_extension_array_dtype(dtype) and self.ndim > 1:
             # GH 18099/22869: columnwise conversion to extension dtype
             # GH 24704: use iloc to handle duplicate column names
+            # TODO(EA2D): special case not needed with 2D EAs
             results = [
                 self.iloc[:, i].astype(dtype, copy=copy)
                 for i in range(len(self.columns))

diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
@@ -75,7 +75,7 @@
 )
 from pandas.core.base import PandasObject
 import pandas.core.common as com
-from pandas.core.construction import array as pd_array, extract_array
+from pandas.core.construction import extract_array
 from pandas.core.indexers import (
     check_setitem_lengths,
     is_empty_indexer,
@@ -676,24 +676,6 @@ def _astype(self, dtype: DtypeObj, copy: bool) -> ArrayLike:
             values = values.astype(dtype, copy=copy)
 
         else:
-            if issubclass(dtype.type, str):
-                if values.dtype.kind in ["m", "M"]:
-                    # use native type formatting for datetime/tz/timedelta
-                    arr = pd_array(values)
-                    # Note: in the case where dtype is an np.dtype, i.e. not
-                    #  StringDtype, this matches arr.astype(dtype), xref GH#36153
-                    values = arr._format_native_types(na_rep="NaT")
-
-            elif is_object_dtype(dtype):
-                if values.dtype.kind in ["m", "M"]:
-                    # Wrap in Timedelta/Timestamp
-                    arr = pd_array(values)
-                    values = arr.astype(object)
-                else:
-                    values = values.astype(object)
-                # We still need to go through astype_nansafe for
-                #  e.g. dtype = Sparse[object, 0]
-
             values = astype_nansafe(values, dtype, copy=True)
 
         return values

diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
@@ -126,7 +126,12 @@ def test_string_methods(input, method, dtype, request):
 def test_astype_roundtrip(dtype, request):
     if dtype == "arrow_string":
         reason = "ValueError: Could not convert object to NumPy datetime"
-        mark = pytest.mark.xfail(reason=reason)
+        mark = pytest.mark.xfail(reason=reason, raises=ValueError)
+        request.node.add_marker(mark)
+    else:
+        mark = pytest.mark.xfail(
+            reason="GH#36153 casting from StringArray to dt64 fails", raises=ValueError
+        )
         request.node.add_marker(mark)
 
     ser = pd.Series(pd.date_range("2000", periods=12))

diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py
@@ -611,3 +611,31 @@ def test_astype_tz_object_conversion(self, tz):
         # do real test: object dtype to a specified tz, different from construction tz.
         result = result.astype({"tz": "datetime64[ns, Europe/London]"})
         tm.assert_frame_equal(result, expected)
+
+    def test_astype_dt64_to_string(self, frame_or_series, tz_naive_fixture, request):
+        tz = tz_naive_fixture
+        if tz is None:
+            mark = pytest.mark.xfail(
+                reason="GH#36153 uses ndarray formatting instead of DTA formatting"
+            )
+            request.node.add_marker(mark)
+
+        dti = date_range("2016-01-01", periods=3, tz=tz)
+        dta = dti._data
+        dta[0] = NaT
+
+        obj = frame_or_series(dta)
+        result = obj.astype("string")
+
+        # Check that Series/DataFrame.astype matches DatetimeArray.astype
+        expected = frame_or_series(dta.astype("string"))
+        tm.assert_equal(result, expected)
+
+        item = result.iloc[0]
+        if frame_or_series is DataFrame:
+            item = item.iloc[0]
+        assert item is pd.NA
+
+        # For non-NA values, we should match what we get for non-EA str
+        alt = obj.astype(str)
+        assert np.all(alt.iloc[1:] == result.iloc[1:])