Skip to content

Commit

Permalink
REF: Block._astype defer to astype_nansafe in more cases (#38562)
Browse files: browse the repository at this point in the history
  • Loading branch information
jbrockmendel authored Dec 21, 2020
1 parent d3c52e4 commit 9a46a4b
Show file tree
Hide file tree
Showing 7 changed files with 54 additions and 30 deletions.
2 changes: 1 addition & 1 deletion pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,7 +346,7 @@ def astype(self, dtype, copy=True):
elif is_string_dtype(dtype) and not is_categorical_dtype(dtype):
if is_extension_array_dtype(dtype):
arr_cls = dtype.construct_array_type()
return arr_cls._from_sequence(self, dtype=dtype)
return arr_cls._from_sequence(self, dtype=dtype, copy=copy)
else:
return self._format_native_types()
elif is_integer_dtype(dtype):
Expand Down
9 changes: 7 additions & 2 deletions pandas/core/arrays/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -589,10 +589,15 @@ def astype(self, dtype, copy=True):

if is_datetime64_ns_dtype(dtype) and not is_dtype_equal(dtype, self.dtype):
# GH#18951: datetime64_ns dtype but not equal means different tz
# FIXME: this doesn't match DatetimeBlock.astype, xref GH#33401
new_tz = getattr(dtype, "tz", None)
if getattr(self.dtype, "tz", None) is None:
if self.tz is None:
return self.tz_localize(new_tz)
result = self.tz_convert(new_tz)
elif new_tz is None:
result = self.tz_convert("UTC").tz_localize(None)
else:
result = self.tz_convert(new_tz)

if copy:
result = result.copy()
if new_tz is None:
Expand Down
17 changes: 10 additions & 7 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
conversion,
iNaT,
ints_to_pydatetime,
ints_to_pytimedelta,
)
from pandas._libs.tslibs.timezones import tz_compare
from pandas._typing import AnyArrayLike, ArrayLike, Dtype, DtypeObj, Scalar
Expand Down Expand Up @@ -987,15 +986,21 @@ def astype_nansafe(
elif not isinstance(dtype, np.dtype):
raise ValueError("dtype must be np.dtype or ExtensionDtype")

if arr.dtype.kind in ["m", "M"] and (
issubclass(dtype.type, str) or dtype == object
):
from pandas.core.construction import ensure_wrapped_if_datetimelike

arr = ensure_wrapped_if_datetimelike(arr)
return arr.astype(dtype, copy=copy)

if issubclass(dtype.type, str):
return lib.ensure_string_array(
arr.ravel(), skipna=skipna, convert_na_value=False
).reshape(arr.shape)

elif is_datetime64_dtype(arr):
if is_object_dtype(dtype):
return ints_to_pydatetime(arr.view(np.int64))
elif dtype == np.int64:
if dtype == np.int64:
if isna(arr).any():
raise ValueError("Cannot convert NaT values to integer")
return arr.view(dtype)
Expand All @@ -1007,9 +1012,7 @@ def astype_nansafe(
raise TypeError(f"cannot astype a datetimelike from [{arr.dtype}] to [{dtype}]")

elif is_timedelta64_dtype(arr):
if is_object_dtype(dtype):
return ints_to_pytimedelta(arr.view(np.int64))
elif dtype == np.int64:
if dtype == np.int64:
if isna(arr).any():
raise ValueError("Cannot convert NaT values to integer")
return arr.view(dtype)
Expand Down
1 change: 1 addition & 0 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -5857,6 +5857,7 @@ def astype(
elif is_extension_array_dtype(dtype) and self.ndim > 1:
# GH 18099/22869: columnwise conversion to extension dtype
# GH 24704: use iloc to handle duplicate column names
# TODO(EA2D): special case not needed with 2D EAs
results = [
self.iloc[:, i].astype(dtype, copy=copy)
for i in range(len(self.columns))
Expand Down
20 changes: 1 addition & 19 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@
)
from pandas.core.base import PandasObject
import pandas.core.common as com
from pandas.core.construction import array as pd_array, extract_array
from pandas.core.construction import extract_array
from pandas.core.indexers import (
check_setitem_lengths,
is_empty_indexer,
Expand Down Expand Up @@ -676,24 +676,6 @@ def _astype(self, dtype: DtypeObj, copy: bool) -> ArrayLike:
values = values.astype(dtype, copy=copy)

else:
if issubclass(dtype.type, str):
if values.dtype.kind in ["m", "M"]:
# use native type formatting for datetime/tz/timedelta
arr = pd_array(values)
# Note: in the case where dtype is an np.dtype, i.e. not
# StringDtype, this matches arr.astype(dtype), xref GH#36153
values = arr._format_native_types(na_rep="NaT")

elif is_object_dtype(dtype):
if values.dtype.kind in ["m", "M"]:
# Wrap in Timedelta/Timestamp
arr = pd_array(values)
values = arr.astype(object)
else:
values = values.astype(object)
# We still need to go through astype_nansafe for
# e.g. dtype = Sparse[object, 0]

values = astype_nansafe(values, dtype, copy=True)

return values
Expand Down
7 changes: 6 additions & 1 deletion pandas/tests/arrays/string_/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,12 @@ def test_string_methods(input, method, dtype, request):
def test_astype_roundtrip(dtype, request):
if dtype == "arrow_string":
reason = "ValueError: Could not convert object to NumPy datetime"
mark = pytest.mark.xfail(reason=reason)
mark = pytest.mark.xfail(reason=reason, raises=ValueError)
request.node.add_marker(mark)
else:
mark = pytest.mark.xfail(
reason="GH#36153 casting from StringArray to dt64 fails", raises=ValueError
)
request.node.add_marker(mark)

ser = pd.Series(pd.date_range("2000", periods=12))
Expand Down
28 changes: 28 additions & 0 deletions pandas/tests/frame/methods/test_astype.py
Original file line number Diff line number Diff line change
Expand Up @@ -611,3 +611,31 @@ def test_astype_tz_object_conversion(self, tz):
# do real test: object dtype to a specified tz, different from construction tz.
result = result.astype({"tz": "datetime64[ns, Europe/London]"})
tm.assert_frame_equal(result, expected)

def test_astype_dt64_to_string(self, frame_or_series, tz_naive_fixture, request):
    # Casting datetime64 data held in a Series/DataFrame to the "string"
    # extension dtype should give the same result as casting the underlying
    # DatetimeArray directly, with the missing entry coming back as pd.NA.
    tz = tz_naive_fixture
    if tz is None:
        # The tz-naive case is marked as an expected failure; the reason
        # string below records the known discrepancy.
        mark = pytest.mark.xfail(
            reason="GH#36153 uses ndarray formatting instead of DTA formatting"
        )
        request.node.add_marker(mark)

    dti = date_range("2016-01-01", periods=3, tz=tz)
    dta = dti._data
    # Put a missing value in the first slot so NA handling is exercised.
    dta[0] = NaT

    obj = frame_or_series(dta)
    result = obj.astype("string")

    # Check that Series/DataFrame.astype matches DatetimeArray.astype
    expected = frame_or_series(dta.astype("string"))
    tm.assert_equal(result, expected)

    # Pull out the first scalar; for the DataFrame case the first .iloc[0]
    # is not yet a scalar, so index once more.
    item = result.iloc[0]
    if frame_or_series is DataFrame:
        item = item.iloc[0]
    assert item is pd.NA

    # For non-NA values, we should match what we get for non-EA str
    alt = obj.astype(str)
    assert np.all(alt.iloc[1:] == result.iloc[1:])

0 comments on commit 9a46a4b

Please sign in to comment.