Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: add NA scalar for missing value indicator, use in StringArray. #29597

Merged
merged 25 commits into from
Dec 1, 2019
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
03f83bd
ENH: add NA scalar for missing value indicator
jorisvandenbossche Nov 12, 2019
c1797d5
add np.nan to arithmetic/comparison tests
jorisvandenbossche Nov 13, 2019
3339eaa
use id(self) for hash
jorisvandenbossche Nov 13, 2019
e9d4d6a
fix api test
jorisvandenbossche Nov 13, 2019
4450d2d
move to cython
jorisvandenbossche Nov 13, 2019
1849a23
add examples to isna/notna docstring
jorisvandenbossche Nov 14, 2019
c72e3ee
Use NA scalar in string dtype (#1)
TomAugspurger Nov 14, 2019
3a97782
Merge remote-tracking branch 'upstream/master' into NA-scalar
jorisvandenbossche Nov 14, 2019
2302661
fix doctest
jorisvandenbossche Nov 14, 2019
2ab592a
small edits
jorisvandenbossche Nov 14, 2019
018399e
fix NA in repr
jorisvandenbossche Nov 15, 2019
31290b9
Merge remote-tracking branch 'upstream/master' into NA-scalar
TomAugspurger Nov 19, 2019
33fd3e0
remove redundant test
TomAugspurger Nov 19, 2019
289c885
remove dead code
TomAugspurger Nov 19, 2019
22de7cd
Merge remote-tracking branch 'upstream/master' into NA-scalar
jorisvandenbossche Nov 20, 2019
f8208db
fix divmod
jorisvandenbossche Nov 21, 2019
371eeeb
Merge remote-tracking branch 'upstream/master' into NA-scalar
jorisvandenbossche Nov 21, 2019
1cadeda
Merge remote-tracking branch 'upstream/master' into NA-scalar
jorisvandenbossche Nov 25, 2019
1fcf4b7
NA -> C_NA
jorisvandenbossche Nov 25, 2019
f6798e5
start some docs
jorisvandenbossche Nov 26, 2019
14c1434
further doc updates
jorisvandenbossche Nov 27, 2019
788a2c2
Merge remote-tracking branch 'upstream/master' into NA-scalar
jorisvandenbossche Nov 27, 2019
1bcbab2
doc fixup
jorisvandenbossche Nov 27, 2019
775cdfb
Merge remote-tracking branch 'upstream/master' into NA-scalar
jorisvandenbossche Nov 27, 2019
589a961
further doc updates
jorisvandenbossche Nov 28, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pandas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@
DatetimeTZDtype,
StringDtype,
# missing
NA,
isna,
isnull,
notna,
Expand Down
5 changes: 3 additions & 2 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ from pandas._libs.tslibs.timedeltas cimport convert_to_timedelta64
from pandas._libs.tslibs.timezones cimport get_timezone, tz_compare

from pandas._libs.missing cimport (
checknull, isnaobj, is_null_datetime64, is_null_timedelta64, is_null_period
checknull, isnaobj, is_null_datetime64, is_null_timedelta64, is_null_period, C_NA
)


Expand Down Expand Up @@ -161,6 +161,7 @@ def is_scalar(val: object) -> bool:
or PyTime_Check(val)
# We differ from numpy, which claims that None is not scalar;
# see np.isscalar
or val is C_NA
or val is None
or isinstance(val, (Fraction, Number))
or util.is_period_object(val)
Expand Down Expand Up @@ -1502,7 +1503,7 @@ cdef class Validator:
f'must define is_value_typed')

cdef bint is_valid_null(self, object value) except -1:
return value is None or util.is_nan(value)
return value is None or value is C_NA or util.is_nan(value)

cdef bint is_array_typed(self) except -1:
return False
Expand Down
5 changes: 5 additions & 0 deletions pandas/_libs/missing.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,8 @@ cpdef ndarray[uint8_t] isnaobj(ndarray arr)
cdef bint is_null_datetime64(v)
cdef bint is_null_timedelta64(v)
cdef bint is_null_period(v)

cdef class C_NAType:
pass

cdef C_NAType C_NA
138 changes: 137 additions & 1 deletion pandas/_libs/missing.pyx
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import cython
from cython import Py_ssize_t

import numbers

import numpy as np
cimport numpy as cnp
from numpy cimport ndarray, int64_t, uint8_t, float64_t
Expand Down Expand Up @@ -44,7 +46,7 @@ cpdef bint checknull(object val):
The difference between `checknull` and `checknull_old` is that `checknull`
does *not* consider INF or NEGINF to be NA.
"""
return is_null_datetimelike(val, inat_is_null=False)
return val is C_NA or is_null_datetimelike(val, inat_is_null=False)
jreback marked this conversation as resolved.
Show resolved Hide resolved


cpdef bint checknull_old(object val):
Expand Down Expand Up @@ -278,3 +280,137 @@ cdef inline bint is_null_period(v):
# determine if we have a null for a Period (or integer versions),
# excluding np.datetime64('nat') and np.timedelta64('nat')
return checknull_with_nat(v)


# -----------------------------------------------------------------------------
# Implementation of NA singleton


def _create_binary_propagating_op(name, divmod=False):

def method(self, other):
if isinstance(other, numbers.Number) or other is NA or isinstance(other, str):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you put the numbers.Number check last, as it will be least performant

if divmod:
return NA, NA
else:
return NA

return NotImplemented

method.__name__ = name
return method


def _create_unary_propagating_op(name):
def method(self):
return NA

method.__name__ = name
jreback marked this conversation as resolved.
Show resolved Hide resolved
return method


cdef class C_NAType:
    # Minimal C-level base type: it lets other Cython modules cimport the
    # singleton (declared as ``C_NA`` in the .pxd) and test ``val is C_NA``.
    pass


class NAType(C_NAType):
    """
    NA ("not available") missing value indicator.

    .. warning::

       Experimental: the behaviour of NA can still change without warning.

    .. versionadded:: 1.0.0

    The NA singleton is a missing value indicator defined by pandas. It is
    used in certain new extension dtypes (currently the "string" dtype).
    """

    _instance = None

    def __new__(cls, *args, **kwargs):
        # Singleton: the instance is created on first use and every later
        # call hands back that same object.
        if NAType._instance is None:
            NAType._instance = C_NAType.__new__(cls, *args, **kwargs)
        return NAType._instance

    def __repr__(self) -> str:
        return "NA"

    def __str__(self) -> str:
        return "NA"

    def __bool__(self):
        # NOTE: array_equivalent matches on this exact message text, so it
        # must not be reworded without updating that check.
        raise TypeError("boolean value of NA is ambiguous")

    def __hash__(self):
        # __eq__ is overridden below, so __hash__ has to be defined
        # explicitly for NA to stay hashable.
        return id(self)

    # Binary arithmetic and comparison ops -> propagate

    __add__ = _create_binary_propagating_op("__add__")
    __radd__ = _create_binary_propagating_op("__radd__")
    __sub__ = _create_binary_propagating_op("__sub__")
    __rsub__ = _create_binary_propagating_op("__rsub__")
    __mul__ = _create_binary_propagating_op("__mul__")
    __rmul__ = _create_binary_propagating_op("__rmul__")
    __matmul__ = _create_binary_propagating_op("__matmul__")
    __rmatmul__ = _create_binary_propagating_op("__rmatmul__")
    __truediv__ = _create_binary_propagating_op("__truediv__")
    __rtruediv__ = _create_binary_propagating_op("__rtruediv__")
    __floordiv__ = _create_binary_propagating_op("__floordiv__")
    __rfloordiv__ = _create_binary_propagating_op("__rfloordiv__")
    __mod__ = _create_binary_propagating_op("__mod__")
    __rmod__ = _create_binary_propagating_op("__rmod__")
    __divmod__ = _create_binary_propagating_op("__divmod__", divmod=True)
    __rdivmod__ = _create_binary_propagating_op("__rdivmod__", divmod=True)
    __pow__ = _create_binary_propagating_op("__pow__")
    __rpow__ = _create_binary_propagating_op("__rpow__")
    # __lshift__ and __rshift__ are not implemented

    __eq__ = _create_binary_propagating_op("__eq__")
    __ne__ = _create_binary_propagating_op("__ne__")
    __le__ = _create_binary_propagating_op("__le__")
    __lt__ = _create_binary_propagating_op("__lt__")
    __gt__ = _create_binary_propagating_op("__gt__")
    __ge__ = _create_binary_propagating_op("__ge__")

    # Unary ops

    __neg__ = _create_unary_propagating_op("__neg__")
    __pos__ = _create_unary_propagating_op("__pos__")
    __abs__ = _create_unary_propagating_op("__abs__")
    __invert__ = _create_unary_propagating_op("__invert__")

    # Logical ops using Kleene logic

    def __and__(self, other):
        # NA & False is False whatever the missing value would have been;
        # with True or NA the result stays unknown.
        if other is False:
            return False
        elif other is True or other is NA:
            return NA
        else:
            return NotImplemented

    __rand__ = __and__

    def __or__(self, other):
        # NA | True is True whatever the missing value would have been;
        # with False or NA the result stays unknown.
        if other is True:
            return True
        elif other is False or other is NA:
            return NA
        else:
            return NotImplemented

    __ror__ = __or__

    def __xor__(self, other):
        # xor always depends on both operands, so any boolean or NA operand
        # gives an unknown result.
        if other is False or other is True or other is NA:
            return NA
        return NotImplemented

    __rxor__ = __xor__


C_NA = NAType() # C-visible
NA = C_NA # Python-visible
8 changes: 5 additions & 3 deletions pandas/_libs/testing.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -180,13 +180,15 @@ cpdef assert_almost_equal(a, b,
# classes can't be the same, to raise error
assert_class_equal(a, b, obj=obj)

if a == b:
# object comparison
return True
if isna(a) and isna(b):
# TODO: Should require same-dtype NA?
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this might be a reasonable time to start enforcing same-NA

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Open a separate issue for it?

# nan / None comparison
return True

if a == b:
# object comparison
return True

if is_comparable_as_number(a) and is_comparable_as_number(b):
if array_equivalent(a, b, strict_nan=True):
# inf comparison
Expand Down
2 changes: 2 additions & 0 deletions pandas/core/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,3 +56,5 @@

# DataFrame needs to be imported after NamedAgg to avoid a circular import
from pandas.core.frame import DataFrame # isort:skip

from pandas._libs.missing import NA
3 changes: 3 additions & 0 deletions pandas/core/arrays/numpy_.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,9 @@ def fillna(self, value=None, method=None, limit=None):
return new_values

def take(self, indices, allow_fill=False, fill_value=None):
if fill_value is None:
# Primarily for subclasses
fill_value = self.dtype.na_value
TomAugspurger marked this conversation as resolved.
Show resolved Hide resolved
result = take(
self._ndarray, indices, allow_fill=allow_fill, fill_value=fill_value
)
Expand Down
39 changes: 17 additions & 22 deletions pandas/core/arrays/string_.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import operator
from typing import TYPE_CHECKING, Type
from typing import Type

import numpy as np

from pandas._libs import lib
from pandas._libs import lib, missing as libmissing

from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.dtypes.common import pandas_dtype
Expand All @@ -17,9 +17,6 @@
from pandas.core.construction import extract_array
from pandas.core.missing import isna

if TYPE_CHECKING:
from pandas._typing import Scalar


@register_extension_dtype
class StringDtype(ExtensionDtype):
Expand Down Expand Up @@ -50,16 +47,8 @@ class StringDtype(ExtensionDtype):
StringDtype
"""

@property
def na_value(self) -> "Scalar":
"""
StringDtype uses :attr:`numpy.nan` as the missing NA value.

.. warning::

`na_value` may change in a future release.
"""
return np.nan
#: StringDtype.na_value uses pandas.NA
TomAugspurger marked this conversation as resolved.
Show resolved Hide resolved
na_value = libmissing.NA

@property
def type(self) -> Type:
Expand Down Expand Up @@ -149,7 +138,7 @@ class StringArray(PandasArray):
--------
>>> pd.array(['This is', 'some text', None, 'data.'], dtype="string")
<StringArray>
['This is', 'some text', nan, 'data.']
['This is', 'some text', NA, 'data.']
Length: 4, dtype: string

Unlike ``object`` dtype arrays, ``StringArray`` doesn't allow non-string
Expand Down Expand Up @@ -190,10 +179,10 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
if dtype:
assert dtype == "string"
result = super()._from_sequence(scalars, dtype=object, copy=copy)
# convert None to np.nan
# Standardize all missing-like values to NA
# TODO: it would be nice to do this in _validate / lib.is_string_array
# We are already doing a scan over the values there.
result[result.isna()] = np.nan
result[result.isna()] = StringDtype.na_value
return result

@classmethod
Expand All @@ -210,6 +199,12 @@ def __arrow_array__(self, type=None):
type = pa.string()
return pa.array(self._ndarray, type=type, from_pandas=True)

def _values_for_factorize(self):
arr = self._ndarray.copy()
mask = self.isna()
arr[mask] = -1
return arr, -1

def __setitem__(self, key, value):
value = extract_array(value, extract_numpy=True)
if isinstance(value, type(self)):
Expand All @@ -223,9 +218,9 @@ def __setitem__(self, key, value):

# validate new items
if scalar_value:
if scalar_value is None:
value = np.nan
elif not (isinstance(value, str) or np.isnan(value)):
if isna(value):
jreback marked this conversation as resolved.
Show resolved Hide resolved
value = StringDtype.na_value
elif not isinstance(value, str):
raise ValueError(
"Cannot set non-string value '{}' into a StringArray.".format(value)
)
Expand Down Expand Up @@ -283,7 +278,7 @@ def method(self, other):
other = other[valid]

result = np.empty_like(self._ndarray, dtype="object")
result[mask] = np.nan
result[mask] = StringDtype.na_value
result[valid] = op(self._ndarray[valid], other)

if op.__name__ in {"add", "radd", "mul", "rmul"}:
Expand Down
11 changes: 11 additions & 0 deletions pandas/core/dtypes/missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,9 @@ def isna(obj):
>>> pd.isna('dog')
False

>>> pd.isna(pd.NA)
True

>>> pd.isna(np.nan)
True

Expand Down Expand Up @@ -326,6 +329,9 @@ def notna(obj):
>>> pd.notna('dog')
True

>>> pd.notna(pd.NA)
False

>>> pd.notna(np.nan)
False

Expand Down Expand Up @@ -443,6 +449,9 @@ def array_equivalent(left, right, strict_nan: bool = False) -> bool:
if left_value is NaT and right_value is not NaT:
return False

elif left_value is libmissing.NA and right_value is not libmissing.NA:
return False

elif isinstance(left_value, float) and np.isnan(left_value):
if not isinstance(right_value, float) or not np.isnan(right_value):
return False
Expand All @@ -454,6 +463,8 @@ def array_equivalent(left, right, strict_nan: bool = False) -> bool:
if "Cannot compare tz-naive" in str(err):
# tzawareness compat failure, see GH#28507
return False
elif "boolean value of NA is ambiguous" in str(err):
return False
raise
return True

Expand Down
3 changes: 3 additions & 0 deletions pandas/io/formats/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
from pandas._config.config import get_option, set_option

from pandas._libs import lib
from pandas._libs.missing import NA
from pandas._libs.tslib import format_array_from_datetime
from pandas._libs.tslibs import NaT, Timedelta, Timestamp, iNaT
from pandas._libs.tslibs.nattype import NaTType
Expand Down Expand Up @@ -1218,6 +1219,8 @@ def _format(x):
# determine na_rep if x is None or NaT-like
if x is None:
return "None"
elif x is NA:
return "NA"
elif x is NaT or np.isnat(x):
return "NaT"
except (TypeError, ValueError):
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/api/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ class TestPDApi(Base):
deprecated_modules = [] # type: List[str]

# misc
misc = ["IndexSlice", "NaT"]
misc = ["IndexSlice", "NaT", "NA"]

# top-level classes
classes = [
Expand Down
Loading