Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: add NA scalar for missing value indicator, use in StringArray. #29597

Merged
merged 25 commits into from
Dec 1, 2019
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
03f83bd
ENH: add NA scalar for missing value indicator
jorisvandenbossche Nov 12, 2019
c1797d5
add np.nan to arithmetic/comparison tests
jorisvandenbossche Nov 13, 2019
3339eaa
use id(self) for hash
jorisvandenbossche Nov 13, 2019
e9d4d6a
fix api test
jorisvandenbossche Nov 13, 2019
4450d2d
move to cython
jorisvandenbossche Nov 13, 2019
1849a23
add examples to isna/notna docstring
jorisvandenbossche Nov 14, 2019
c72e3ee
Use NA scalar in string dtype (#1)
TomAugspurger Nov 14, 2019
3a97782
Merge remote-tracking branch 'upstream/master' into NA-scalar
jorisvandenbossche Nov 14, 2019
2302661
fix doctest
jorisvandenbossche Nov 14, 2019
2ab592a
small edits
jorisvandenbossche Nov 14, 2019
018399e
fix NA in repr
jorisvandenbossche Nov 15, 2019
31290b9
Merge remote-tracking branch 'upstream/master' into NA-scalar
TomAugspurger Nov 19, 2019
33fd3e0
remove redundant test
TomAugspurger Nov 19, 2019
289c885
remove dead code
TomAugspurger Nov 19, 2019
22de7cd
Merge remote-tracking branch 'upstream/master' into NA-scalar
jorisvandenbossche Nov 20, 2019
f8208db
fix divmod
jorisvandenbossche Nov 21, 2019
371eeeb
Merge remote-tracking branch 'upstream/master' into NA-scalar
jorisvandenbossche Nov 21, 2019
1cadeda
Merge remote-tracking branch 'upstream/master' into NA-scalar
jorisvandenbossche Nov 25, 2019
1fcf4b7
NA -> C_NA
jorisvandenbossche Nov 25, 2019
f6798e5
start some docs
jorisvandenbossche Nov 26, 2019
14c1434
further doc updates
jorisvandenbossche Nov 27, 2019
788a2c2
Merge remote-tracking branch 'upstream/master' into NA-scalar
jorisvandenbossche Nov 27, 2019
1bcbab2
doc fixup
jorisvandenbossche Nov 27, 2019
775cdfb
Merge remote-tracking branch 'upstream/master' into NA-scalar
jorisvandenbossche Nov 27, 2019
589a961
further doc updates
jorisvandenbossche Nov 28, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pandas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@
DatetimeTZDtype,
StringDtype,
# missing
NA,
isna,
isnull,
notna,
Expand Down
4 changes: 2 additions & 2 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -58,10 +58,9 @@ from pandas._libs.tslibs.timedeltas cimport convert_to_timedelta64
from pandas._libs.tslibs.timezones cimport get_timezone, tz_compare

from pandas._libs.missing cimport (
checknull, isnaobj, is_null_datetime64, is_null_timedelta64, is_null_period
checknull, isnaobj, is_null_datetime64, is_null_timedelta64, is_null_period, C_NA
)


# constants that will be compared to potentially arbitrarily large
# python int
cdef:
Expand Down Expand Up @@ -161,6 +160,7 @@ def is_scalar(val: object) -> bool:
or PyTime_Check(val)
# We differ from numpy, which claims that None is not scalar;
# see np.isscalar
or val is C_NA
or val is None
or isinstance(val, (Fraction, Number))
or util.is_period_object(val)
Expand Down
5 changes: 5 additions & 0 deletions pandas/_libs/missing.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,8 @@ cpdef ndarray[uint8_t] isnaobj(ndarray arr)
cdef bint is_null_datetime64(v)
cdef bint is_null_timedelta64(v)
cdef bint is_null_period(v)

# Declaration of the NA base type so that other Cython modules can cimport it.
cdef class C_NAType:
    pass

# The module-level NA object, typed as C_NAType, for identity checks
# (``val is C_NA``) from Cython code.
cdef C_NAType C_NA
125 changes: 124 additions & 1 deletion pandas/_libs/missing.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ cpdef bint checknull(object val):
The difference between `checknull` and `checknull_old` is that `checknull`
does *not* consider INF or NEGINF to be NA.
"""
return is_null_datetimelike(val, inat_is_null=False)
return val is C_NA or is_null_datetimelike(val, inat_is_null=False)
jreback marked this conversation as resolved.
Show resolved Hide resolved


cpdef bint checknull_old(object val):
Expand Down Expand Up @@ -278,3 +278,126 @@ cdef inline bint is_null_period(v):
# determine if we have a null for a Period (or integer versions),
# excluding np.datetime64('nat') and np.timedelta64('nat')
return checknull_with_nat(v)


# -----------------------------------------------------------------------------
# Implementation of NA singleton


def _create_binary_propagating_op(name):
    """
    Build a binary dunder method that propagates NA.

    Parameters
    ----------
    name : str
        Value assigned to the returned function's ``__name__``
        (for example ``"__add__"``).

    Returns
    -------
    function
        A ``method(self, other)`` that returns ``NA`` when ``other`` is NA,
        a ``str`` or a ``numbers.Number``, and ``NotImplemented`` for any
        other operand so Python can try the reflected operation.
    """
    import numbers

    def method(self, other):
        # Cheapest checks first so they short-circuit: identity, then a
        # concrete-type isinstance. The numbers.Number check goes last
        # because isinstance against an abstract base class is the least
        # performant of the three.
        if (
            other is NA
            or isinstance(other, str)
            or isinstance(other, numbers.Number)
        ):
            return NA

        return NotImplemented

    method.__name__ = name
    return method


def _create_unary_propagating_op(name):
    """
    Build a unary dunder method that always returns NA.

    Parameters
    ----------
    name : str
        Value assigned to the returned function's ``__name__``
        (for example ``"__neg__"``).

    Returns
    -------
    function
        A ``method(self)`` whose result is always ``NA``.
    """

    def method(self):
        # A unary operation on a missing value is still missing.
        return NA

    setattr(method, "__name__", name)
    return method


# Empty extension type that serves as the base class of the Python-level
# NAType defined below.
cdef class C_NAType:
    pass


class NAType(C_NAType):
    """
    NA ("not available") missing value indicator.

    Only one instance is ever created; calling the type again returns that
    same object. Arithmetic, comparison and unary operators are generated
    by ``_create_binary_propagating_op`` and ``_create_unary_propagating_op``.
    ``divmod`` yields a ``(quotient, remainder)`` pair. ``&``, ``|`` and
    ``^`` follow Kleene three-valued logic for ``True``, ``False`` and
    ``NA`` operands. NA has no truth value, so ``bool(NA)`` raises
    ``TypeError``.
    """

    _instance = None

    def __new__(cls, *args, **kwargs):
        # Singleton: allocate on the first call only, then keep returning
        # the stored instance.
        if NAType._instance is None:
            NAType._instance = C_NAType.__new__(cls, *args, **kwargs)
        return NAType._instance

    def __repr__(self) -> str:
        return "NA"

    def __str__(self) -> str:
        return "NA"

    def __bool__(self):
        raise TypeError("boolean value of NA is ambiguous")

    def __hash__(self):
        # Identity-based hash; __new__ guarantees there is a single instance.
        return id(self)

    # Binary arithmetic and comparison ops -> propagate

    __add__ = _create_binary_propagating_op("__add__")
    __radd__ = _create_binary_propagating_op("__radd__")
    __sub__ = _create_binary_propagating_op("__sub__")
    __rsub__ = _create_binary_propagating_op("__rsub__")
    __mul__ = _create_binary_propagating_op("__mul__")
    __rmul__ = _create_binary_propagating_op("__rmul__")
    __matmul__ = _create_binary_propagating_op("__matmul__")
    __rmatmul__ = _create_binary_propagating_op("__rmatmul__")
    __truediv__ = _create_binary_propagating_op("__truediv__")
    __rtruediv__ = _create_binary_propagating_op("__rtruediv__")
    __floordiv__ = _create_binary_propagating_op("__floordiv__")
    __rfloordiv__ = _create_binary_propagating_op("__rfloordiv__")
    __mod__ = _create_binary_propagating_op("__mod__")
    __rmod__ = _create_binary_propagating_op("__rmod__")
    __pow__ = _create_binary_propagating_op("__pow__")
    __rpow__ = _create_binary_propagating_op("__rpow__")
    # __lshift__ and __rshift__ are not implemented

    def __divmod__(self, other):
        # divmod() is documented to return a (quotient, remainder) pair, so
        # propagate NA into both slots instead of returning a bare NA.
        quotient = self.__floordiv__(other)
        if quotient is NotImplemented:
            return NotImplemented
        return quotient, self.__mod__(other)

    def __rdivmod__(self, other):
        # Reflected counterpart of __divmod__; same pair-shaped result.
        quotient = self.__rfloordiv__(other)
        if quotient is NotImplemented:
            return NotImplemented
        return quotient, self.__rmod__(other)

    __eq__ = _create_binary_propagating_op("__eq__")
    __ne__ = _create_binary_propagating_op("__ne__")
    __le__ = _create_binary_propagating_op("__le__")
    __lt__ = _create_binary_propagating_op("__lt__")
    __gt__ = _create_binary_propagating_op("__gt__")
    __ge__ = _create_binary_propagating_op("__ge__")

    # Unary ops

    __neg__ = _create_unary_propagating_op("__neg__")
    __pos__ = _create_unary_propagating_op("__pos__")
    __abs__ = _create_unary_propagating_op("__abs__")
    __invert__ = _create_unary_propagating_op("__invert__")

    # Logical ops using Kleene logic

    def __and__(self, other):
        # False decides the result on its own; True or NA leave it unknown.
        if other is False:
            return False
        elif other is True or other is NA:
            return NA
        else:
            return NotImplemented

    __rand__ = __and__

    def __or__(self, other):
        # True decides the result on its own; False or NA leave it unknown.
        if other is True:
            return True
        elif other is False or other is NA:
            return NA
        else:
            return NotImplemented

    __ror__ = __or__

    def __xor__(self, other):
        # Exclusive-or with an unknown operand is always unknown.
        if other is False or other is True or other is NA:
            return NA
        return NotImplemented

    __rxor__ = __xor__


# Create the single NA instance and bind it under two names: C_NA is the
# name Cython code refers to, NA is the same object for Python callers.
C_NA = NAType() # C-visible
NA = C_NA # Python-visible
2 changes: 2 additions & 0 deletions pandas/core/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,3 +56,5 @@

# DataFrame needs to be imported after NamedAgg to avoid a circular import
from pandas.core.frame import DataFrame # isort:skip

from pandas._libs.missing import NA
2 changes: 1 addition & 1 deletion pandas/tests/api/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ class TestPDApi(Base):
deprecated_modules = [] # type: List[str]

# misc
misc = ["IndexSlice", "NaT"]
misc = ["IndexSlice", "NaT", "NA"]

# top-level classes
classes = [
Expand Down
128 changes: 128 additions & 0 deletions pandas/tests/scalar/test_na_scalar.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
import numpy as np
import pytest

from pandas._libs.missing import NA

from pandas.core.dtypes.common import is_scalar

import pandas as pd
import pandas.util.testing as tm


def test_singleton():
    """Instantiating the NA type again must hand back the one shared NA."""
    assert NA is NA

    na_type = type(NA)
    assert na_type() is NA


def test_repr():
    """Both the repr() and the str() form of NA are the text "NA"."""
    for render in (repr, str):
        assert render(NA) == "NA"


def test_truthiness():
    """NA has no truth value: explicit and implicit conversion both raise."""
    # Explicit conversion through bool().
    with pytest.raises(TypeError):
        bool(NA)

    # Implicit conversion triggered by the ``not`` operator.
    with pytest.raises(TypeError):
        not NA


def test_hashable():
    """NA hashes consistently and works as a dictionary key."""
    first_hash, second_hash = hash(NA), hash(NA)
    assert first_hash == second_hash

    lookup = {NA: "test"}
    assert lookup[NA] == "test"


def test_arithmetic_ops(all_arithmetic_functions):
    """The supplied arithmetic function returns NA for NA and each operand."""
    op = all_arithmetic_functions
    operands = [NA, 1, 1.0, "a", np.int64(1), np.nan]

    # The string operand is left out of the check for "rmod".
    if op.__name__ == "rmod":
        operands = [value for value in operands if not isinstance(value, str)]

    for other in operands:
        assert op(NA, other) is NA
jorisvandenbossche marked this conversation as resolved.
Show resolved Hide resolved


def test_comparison_ops():

for other in [NA, 1, 1.0, "a", np.int64(1), np.nan]:
assert (NA == other) is NA
assert (NA != other) is NA
assert (NA > other) is NA
assert (NA >= other) is NA
assert (NA < other) is NA
assert (NA <= other) is NA

if isinstance(other, np.int64):
# when a numpy scalar is the left operand, numpy handles the comparison:
# equality gives a DeprecationWarning and False, ordering comparisons raise
continue
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't have full control over numpy scalars, which means that when one is the left operand we get different behaviour:

In [27]: np.int64(1) == pd.NA 
/home/joris/miniconda3/envs/dev/bin/ipython:1: DeprecationWarning: elementwise comparison failed; this will raise an error in the future.
  #!/home/joris/miniconda3/envs/dev/bin/python
Out[27]: False

In [28]: pd.NA == np.int64(1) 
Out[28]: NA

In [29]: np.int64(1) < pd.NA 
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-29-87134fac2734> in <module>
----> 1 np.int64(1) < pd.NA

~/scipy/pandas/pandas/core/na_scalar.py in __bool__(self)
     37 
     38     def __bool__(self):
---> 39         raise TypeError("boolean value of NA is ambiguous")
     40 
     41     def __hash__(self):

TypeError: boolean value of NA is ambiguous

In [30]: pd.NA > np.int64(1) 
Out[30]: NA

(For the first case, I'm not sure what the behaviour will be once the change in numpy is done.)


assert (other == NA) is NA
assert (other != NA) is NA
assert (other > NA) is NA
assert (other >= NA) is NA
assert (other < NA) is NA
assert (other <= NA) is NA


def test_unary_ops():
    """Unary plus, unary minus, abs() and ``~`` applied to NA all give NA."""
    results = (+NA, -NA, abs(NA), ~NA)
    for result in results:
        assert result is NA


def test_logical_and():
    """``&`` with NA: False wins, True or NA give NA, an int raises."""
    # A False operand decides the result whatever the unknown side is.
    assert (NA & False) is False
    assert (False & NA) is False

    # With True or NA on the other side the outcome stays unknown.
    for unknown in (NA & True, True & NA, NA & NA):
        assert unknown is NA

    with pytest.raises(TypeError):
        NA & 5


def test_logical_or():
    """``|`` with NA: True wins, False or NA give NA, an int raises."""
    # A True operand decides the result whatever the unknown side is.
    assert (NA | True) is True
    assert (True | NA) is True

    # With False or NA on the other side the outcome stays unknown.
    for unknown in (NA | False, False | NA, NA | NA):
        assert unknown is NA

    with pytest.raises(TypeError):
        NA | 5


def test_logical_xor():
    """``^`` with NA gives NA for True, False and NA; an int raises."""
    outcomes = [NA ^ True, True ^ NA, NA ^ False, False ^ NA, NA ^ NA]
    for outcome in outcomes:
        assert outcome is NA

    with pytest.raises(TypeError):
        NA ^ 5


def test_logical_not():
    """Inverting NA with ``~`` yields NA."""
    inverted = ~NA
    assert inverted is NA


def test_is_scalar():
    """is_scalar treats NA as a scalar and returns the bool True itself."""
    verdict = is_scalar(NA)
    assert verdict is True


def test_isna():
    """pd.isna / pd.notna on the NA scalar return True / False exactly."""
    flagged_missing = pd.isna(NA)
    assert flagged_missing is True

    flagged_present = pd.notna(NA)
    assert flagged_present is False


def test_series_isna():
    """Series.isna marks an NA held in an object-dtype Series as missing."""
    values = [1, NA]
    result = pd.Series(values, dtype=object).isna()

    expected = pd.Series([False, True])
    tm.assert_series_equal(result, expected)