diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 1e9c402dac73e6..902ad3138140ba 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -827,6 +827,7 @@ Conversion - Bug in ``Timestamp.replace`` when replacing ``tzinfo`` around DST changes (:issue:`15683`) - Bug in ``Timedelta`` construction and arithmetic that would not propagate the ``Overflow`` exception (:issue:`17367`) - Bug in :meth:`~DataFrame.astype` converting to object dtype when passed extension type classes (`DatetimeTZDtype``, ``CategoricalDtype``) rather than instances. Now a ``TypeError`` is raised when a class is passed (:issue:`17780`). +- Bug in :func:`to_numeric` in which elements were not always being coerced to numeric when ``errors='coerce'`` (:issue:`17007`, :issue:`17125`) Indexing ^^^^^^^^ diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx index 7990fd3b1b5c91..b0a64e1ccc225e 100644 --- a/pandas/_libs/src/inference.pyx +++ b/pandas/_libs/src/inference.pyx @@ -165,20 +165,8 @@ cdef class Seen(object): two conflict cases was also detected. However, we are trying to force conversion to a numeric dtype. """ - if self.uint_ and (self.null_ or self.sint_): - if not self.coerce_numeric: - return True - - if self.null_: - msg = ("uint64 array detected, and such an " - "array cannot contain NaN.") - else: # self.sint_ = 1 - msg = ("uint64 and negative values detected. 
" - "Cannot safely return a numeric array " - "without truncating data.") - - raise ValueError(msg) - return False + return (self.uint_ and (self.null_ or self.sint_) + and not self.coerce_numeric) cdef inline saw_null(self): """ @@ -1103,10 +1091,17 @@ def maybe_convert_numeric(ndarray[object] values, set na_values, seen.saw_int(val) if val >= 0: - uints[i] = val + if val <= oUINT64_MAX: + uints[i] = val + else: + seen.float_ = True if val <= oINT64_MAX: ints[i] = val + + if seen.sint_ and seen.uint_: + seen.float_ = True + elif util.is_bool_object(val): floats[i] = uints[i] = ints[i] = bools[i] = val seen.bool_ = True @@ -1154,6 +1149,8 @@ def maybe_convert_numeric(ndarray[object] values, set na_values, uints[i] = as_int if as_int <= oINT64_MAX: ints[i] = as_int + + seen.float_ = seen.float_ or (seen.uint_ and seen.sint_) else: seen.float_ = True except (TypeError, ValueError) as e: diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index c584e29f682ddc..3d95bc21d4a03f 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -132,7 +132,7 @@ def to_numeric(arg, errors='raise', downcast=None): values = lib.maybe_convert_numeric(values, set(), coerce_numeric=coerce_numeric) - except Exception: + except Exception as e: if errors == 'raise': raise diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 857f7a283aa951..be76c51d116ad6 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -39,6 +39,11 @@ from pandas.util import testing as tm +@pytest.fixture(params=[True, False], ids=lambda val: str(val)) +def coerce(request): + return request.param + + def test_is_sequence(): is_seq = inference.is_sequence assert (is_seq((1, 2))) @@ -340,44 +345,38 @@ def test_convert_numeric_uint64(self): exp = np.array([2**63], dtype=np.uint64) tm.assert_numpy_array_equal(lib.maybe_convert_numeric(arr, set()), exp) - def 
test_convert_numeric_uint64_nan(self): - msg = 'uint64 array detected' - cases = [(np.array([2**63, np.nan], dtype=object), set()), - (np.array([str(2**63), np.nan], dtype=object), set()), - (np.array([np.nan, 2**63], dtype=object), set()), - (np.array([np.nan, str(2**63)], dtype=object), set()), - (np.array([2**63, 2**63 + 1], dtype=object), set([2**63])), - (np.array([str(2**63), str(2**63 + 1)], - dtype=object), set([2**63]))] - - for coerce in (True, False): - for arr, na_values in cases: - if coerce: - with tm.assert_raises_regex(ValueError, msg): - lib.maybe_convert_numeric(arr, na_values, - coerce_numeric=coerce) - else: - tm.assert_numpy_array_equal(lib.maybe_convert_numeric( - arr, na_values), arr) - - def test_convert_numeric_int64_uint64(self): - msg = 'uint64 and negative values detected' - cases = [np.array([2**63, -1], dtype=object), - np.array([str(2**63), -1], dtype=object), - np.array([str(2**63), str(-1)], dtype=object), - np.array([-1, 2**63], dtype=object), - np.array([-1, str(2**63)], dtype=object), - np.array([str(-1), str(2**63)], dtype=object)] - - for coerce in (True, False): - for case in cases: - if coerce: - with tm.assert_raises_regex(ValueError, msg): - lib.maybe_convert_numeric(case, set(), - coerce_numeric=coerce) - else: - tm.assert_numpy_array_equal(lib.maybe_convert_numeric( - case, set()), case) + @pytest.mark.parametrize("arr,na_values", [ + (np.array([2**63, np.nan], dtype=object), set()), + (np.array([str(2**63), np.nan], dtype=object), set()), + (np.array([np.nan, 2**63], dtype=object), set()), + (np.array([np.nan, str(2**63)], dtype=object), set())]) + def test_convert_numeric_uint64_nan(self, coerce, arr, na_values): + expected = arr.astype(float) if coerce else arr.copy() + result = lib.maybe_convert_numeric(arr, na_values, + coerce_numeric=coerce) + tm.assert_almost_equal(result, expected) + + def test_convert_numeric_uint64_nan_values(self, coerce): + arr = np.array([2**63, 2**63 + 1], dtype=object) + na_values = 
set([2**63]) + + expected = (np.array([np.nan, 2**63 + 1], dtype=float) + if coerce else arr.copy()) + result = lib.maybe_convert_numeric(arr, na_values, + coerce_numeric=coerce) + tm.assert_almost_equal(result, expected) + + @pytest.mark.parametrize("case", [ + np.array([2**63, -1], dtype=object), + np.array([str(2**63), -1], dtype=object), + np.array([str(2**63), str(-1)], dtype=object), + np.array([-1, 2**63], dtype=object), + np.array([-1, str(2**63)], dtype=object), + np.array([str(-1), str(2**63)], dtype=object)]) + def test_convert_numeric_int64_uint64(self, case, coerce): + expected = case.astype(float) if coerce else case.copy() + result = lib.maybe_convert_numeric(case, set(), coerce_numeric=coerce) + tm.assert_almost_equal(result, expected) def test_maybe_convert_objects_uint64(self): # see gh-4471 diff --git a/pandas/tests/tools/test_numeric.py b/pandas/tests/tools/test_numeric.py index 1d13ba93ba7592..48f59c00e26a17 100644 --- a/pandas/tests/tools/test_numeric.py +++ b/pandas/tests/tools/test_numeric.py @@ -381,3 +381,20 @@ def test_downcast_limits(self): for dtype, downcast, min_max in dtype_downcast_min_max: series = pd.to_numeric(pd.Series(min_max), downcast=downcast) assert series.dtype == dtype + + def test_coerce_uint64_conflict(self): + # see gh-17007 and gh-17125 + # + # Still returns float despite the uint64-nan conflict, + # which would normally force the casting to object. + df = pd.DataFrame({"a": [200, 300, "", "NaN", 30000000000000000000]}) + expected = pd.Series([200, 300, np.nan, np.nan, + 30000000000000000000], dtype=float, name="a") + result = to_numeric(df["a"], errors="coerce") + tm.assert_series_equal(expected, result) + + s = pd.Series(["12345678901234567890", "1234567890", "ITEM"]) + expected = pd.Series([12345678901234567890, + 1234567890, np.nan], dtype=float) + result = to_numeric(s, errors="coerce") + tm.assert_series_equal(expected, result)