pandas-dev · jreback · Dec 1, 2019 · Nov 12, 2019 · Nov 13, 2019 · Nov 13, 2019
diff --git a/pandas/__init__.py b/pandas/__init__.py
@@ -68,6 +68,7 @@
     DatetimeTZDtype,
     StringDtype,
     # missing
+    NA,
     isna,
     isnull,
     notna,

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -58,7 +58,7 @@ from pandas._libs.tslibs.timedeltas cimport convert_to_timedelta64
 from pandas._libs.tslibs.timezones cimport get_timezone, tz_compare
 
 from pandas._libs.missing cimport (
-    checknull, isnaobj, is_null_datetime64, is_null_timedelta64, is_null_period
+    checknull, isnaobj, is_null_datetime64, is_null_timedelta64, is_null_period, C_NA
 )
 
 
@@ -161,6 +161,7 @@ def is_scalar(val: object) -> bool:
             or PyTime_Check(val)
             # We differ from numpy, which claims that None is not scalar;
             # see np.isscalar
+            or val is C_NA
             or val is None
             or isinstance(val, (Fraction, Number))
             or util.is_period_object(val)
@@ -1502,7 +1503,7 @@ cdef class Validator:
                                   f'must define is_value_typed')
 
     cdef bint is_valid_null(self, object value) except -1:
-        return value is None or util.is_nan(value)
+        return value is None or value is C_NA or util.is_nan(value)
 
     cdef bint is_array_typed(self) except -1:
         return False

diff --git a/pandas/_libs/missing.pxd b/pandas/_libs/missing.pxd
@@ -9,3 +9,8 @@ cpdef ndarray[uint8_t] isnaobj(ndarray arr)
 cdef bint is_null_datetime64(v)
 cdef bint is_null_timedelta64(v)
 cdef bint is_null_period(v)
+
+cdef class C_NAType:  # C-level type declaration backing the C_NA variable below
+    pass
+
+cdef C_NAType C_NA  # shared declaration; lib.pyx cimports this name
diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx
@@ -1,6 +1,8 @@
 import cython
 from cython import Py_ssize_t
 
+import numbers
+
 import numpy as np
 cimport numpy as cnp
 from numpy cimport ndarray, int64_t, uint8_t, float64_t
@@ -44,7 +46,7 @@ cpdef bint checknull(object val):
     The difference between `checknull` and `checknull_old` is that `checknull`
     does *not* consider INF or NEGINF to be NA.
     """
-    return is_null_datetimelike(val, inat_is_null=False)
+    return val is C_NA or is_null_datetimelike(val, inat_is_null=False)
 
 
 cpdef bint checknull_old(object val):
@@ -278,3 +280,137 @@ cdef inline bint is_null_period(v):
     # determine if we have a null for a Period (or integer versions),
     # excluding np.datetime64('nat') and np.timedelta64('nat')
     return checknull_with_nat(v)
+
+
+# -----------------------------------------------------------------------------
+# Implementation of NA singleton
+
+
+def _create_binary_propagating_op(name, divmod=False):
+    # Factory: binary dunder `name` returning NA (an (NA, NA) pair when divmod).
+    def method(self, other):
+        if isinstance(other, numbers.Number) or other is NA or isinstance(other, str):
+            if divmod:
+                return NA, NA  # one NA each for quotient and remainder
+            else:
+                return NA
+        # Unsupported operand type: let Python try the other operand instead.
+        return NotImplemented
+
+    method.__name__ = name  # the generated function reports itself as `name`
+    return method
+
+
+def _create_unary_propagating_op(name):  # factory for unary dunders on NA
+    def method(self):
+        return NA  # unconditionally propagate NA
+
+    method.__name__ = name  # the generated function reports itself as `name`
+    return method
+
+
+cdef class C_NAType:  # empty extension-type base; NAType below subclasses it
+    pass
+
+
+class NAType(C_NAType):
+    """
+    NA ("not available") missing value indicator.
+
+    .. warning::
+
+       Experimental: the behaviour of NA can still change without warning.
+
+    .. versionadded:: 1.0.0
+
+    The NA singleton is a missing value indicator defined by pandas. It is
+    used in certain new extension dtypes (currently the "string" dtype).
+    """
+
+    _instance = None  # holds the one NAType instance once it has been created
+
+    def __new__(cls, *args, **kwargs):
+        if NAType._instance is None:  # first construction only
+            NAType._instance = C_NAType.__new__(cls, *args, **kwargs)
+        return NAType._instance  # later calls hand back the cached object
+
+    def __repr__(self) -> str:
+        return "NA"
+
+    def __str__(self) -> str:
+        return "NA"
+
+    def __bool__(self):  # truth-testing NA (e.g. `if NA:`) is rejected
+        raise TypeError("boolean value of NA is ambiguous")
+
+    def __hash__(self):
+        return id(self)  # identity hash; __new__ only ever creates one instance
+
+    # Binary arithmetic and comparison ops -> propagate
+
+    __add__ = _create_binary_propagating_op("__add__")
+    __radd__ = _create_binary_propagating_op("__radd__")
+    __sub__ = _create_binary_propagating_op("__sub__")
+    __rsub__ = _create_binary_propagating_op("__rsub__")
+    __mul__ = _create_binary_propagating_op("__mul__")
+    __rmul__ = _create_binary_propagating_op("__rmul__")
+    __matmul__ = _create_binary_propagating_op("__matmul__")
+    __rmatmul__ = _create_binary_propagating_op("__rmatmul__")
+    __truediv__ = _create_binary_propagating_op("__truediv__")
+    __rtruediv__ = _create_binary_propagating_op("__rtruediv__")
+    __floordiv__ = _create_binary_propagating_op("__floordiv__")
+    __rfloordiv__ = _create_binary_propagating_op("__rfloordiv__")
+    __mod__ = _create_binary_propagating_op("__mod__")
+    __rmod__ = _create_binary_propagating_op("__rmod__")
+    __divmod__ = _create_binary_propagating_op("__divmod__", divmod=True)
+    __rdivmod__ = _create_binary_propagating_op("__rdivmod__", divmod=True)
+    __pow__ = _create_binary_propagating_op("__pow__")
+    __rpow__ = _create_binary_propagating_op("__rpow__")
+    # __lshift__ and __rshift__ are not implemented
+
+    __eq__ = _create_binary_propagating_op("__eq__")
+    __ne__ = _create_binary_propagating_op("__ne__")
+    __le__ = _create_binary_propagating_op("__le__")
+    __lt__ = _create_binary_propagating_op("__lt__")
+    __gt__ = _create_binary_propagating_op("__gt__")
+    __ge__ = _create_binary_propagating_op("__ge__")
+
+    # Unary ops
+
+    __neg__ = _create_unary_propagating_op("__neg__")
+    __pos__ = _create_unary_propagating_op("__pos__")
+    __abs__ = _create_unary_propagating_op("__abs__")
+    __invert__ = _create_unary_propagating_op("__invert__")
+
+    # Logical ops using Kleene logic
+
+    def __and__(self, other):
+        if other is False:
+            return False  # False & unknown is False whatever the unknown is
+        elif other is True or other is NA:
+            return NA  # outcome depends on the unknown value
+        else:
+            return NotImplemented
+
+    __rand__ = __and__  # reflected form reuses the same function
+
+    def __or__(self, other):
+        if other is True:
+            return True  # True | unknown is True whatever the unknown is
+        elif other is False or other is NA:
+            return NA  # outcome depends on the unknown value
+        else:
+            return NotImplemented
+
+    __ror__ = __or__  # reflected form reuses the same function
+
+    def __xor__(self, other):
+        if other is False or other is True or other is NA:
+            return NA  # xor cannot be decided while one operand is unknown
+        return NotImplemented
+
+    __rxor__ = __xor__  # reflected form reuses the same function
+
+
+C_NA = NAType()   # C-visible: declared `cdef C_NAType C_NA` in missing.pxd
+NA = C_NA         # Python-visible name bound to the same singleton object
diff --git a/pandas/_libs/testing.pyx b/pandas/_libs/testing.pyx
@@ -180,13 +180,15 @@ cpdef assert_almost_equal(a, b,
         # classes can't be the same, to raise error
         assert_class_equal(a, b, obj=obj)
 
-    if a == b:
-        # object comparison
-        return True
     if isna(a) and isna(b):
         # TODO: Should require same-dtype NA?
         # nan / None comparison
         return True
+
+    if a == b:
+        # object comparison
+        return True
+
     if is_comparable_as_number(a) and is_comparable_as_number(b):
         if array_equivalent(a, b, strict_nan=True):
             # inf comparison

diff --git a/pandas/core/api.py b/pandas/core/api.py
@@ -56,3 +56,5 @@
 
 # DataFrame needs to be imported after NamedAgg to avoid a circular import
 from pandas.core.frame import DataFrame  # isort:skip
+
+from pandas._libs.missing import NA
diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py
@@ -278,6 +278,9 @@ def fillna(self, value=None, method=None, limit=None):
         return new_values
 
     def take(self, indices, allow_fill=False, fill_value=None):
+        if fill_value is None:
+            # Default to the dtype's na_value; primarily for subclasses
+            fill_value = self.dtype.na_value
         result = take(
             self._ndarray, indices, allow_fill=allow_fill, fill_value=fill_value
         )

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
@@ -1,9 +1,9 @@
 import operator
-from typing import TYPE_CHECKING, Type
+from typing import Type
 
 import numpy as np
 
-from pandas._libs import lib
+from pandas._libs import lib, missing as libmissing
 
 from pandas.core.dtypes.base import ExtensionDtype
 from pandas.core.dtypes.common import pandas_dtype
@@ -17,9 +17,6 @@
 from pandas.core.construction import extract_array
 from pandas.core.missing import isna
 
-if TYPE_CHECKING:
-    from pandas._typing import Scalar
-
 
 @register_extension_dtype
 class StringDtype(ExtensionDtype):
@@ -50,16 +47,8 @@ class StringDtype(ExtensionDtype):
     StringDtype
     """
 
-    @property
-    def na_value(self) -> "Scalar":
-        """
-        StringDtype uses :attr:`numpy.nan` as the missing NA value.
-
-        .. warning::
-
-           `na_value` may change in a future release.
-        """
-        return np.nan
+    #: StringDtype.na_value uses pandas.NA
+    na_value = libmissing.NA
 
     @property
     def type(self) -> Type:
@@ -149,7 +138,7 @@ class StringArray(PandasArray):
     --------
     >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string")
     <StringArray>
-    ['This is', 'some text', nan, 'data.']
+    ['This is', 'some text', NA, 'data.']
     Length: 4, dtype: string
 
     Unlike ``object`` dtype arrays, ``StringArray`` doesn't allow non-string
@@ -190,10 +179,10 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
         if dtype:
             assert dtype == "string"
         result = super()._from_sequence(scalars, dtype=object, copy=copy)
-        # convert None to np.nan
+        # Standardize all missing-like values to NA
         # TODO: it would be nice to do this in _validate / lib.is_string_array
         # We are already doing a scan over the values there.
-        result[result.isna()] = np.nan
+        result[result.isna()] = StringDtype.na_value
         return result
 
     @classmethod
@@ -210,6 +199,12 @@ def __arrow_array__(self, type=None):
             type = pa.string()
         return pa.array(self._ndarray, type=type, from_pandas=True)
 
+    def _values_for_factorize(self):  # returns (values copy, missing-value marker)
+        arr = self._ndarray.copy()  # copy so the array's own data is not modified
+        mask = self.isna()
+        arr[mask] = -1  # overwrite missing entries with the marker returned below
+        return arr, -1
+
     def __setitem__(self, key, value):
         value = extract_array(value, extract_numpy=True)
         if isinstance(value, type(self)):
@@ -223,9 +218,9 @@ def __setitem__(self, key, value):
 
         # validate new items
         if scalar_value:
-            if scalar_value is None:
-                value = np.nan
-            elif not (isinstance(value, str) or np.isnan(value)):
+            if isna(value):
+                value = StringDtype.na_value
+            elif not isinstance(value, str):
                 raise ValueError(
                     "Cannot set non-string value '{}' into a StringArray.".format(value)
                 )
@@ -283,7 +278,7 @@ def method(self, other):
                 other = other[valid]
 
             result = np.empty_like(self._ndarray, dtype="object")
-            result[mask] = np.nan
+            result[mask] = StringDtype.na_value
             result[valid] = op(self._ndarray[valid], other)
 
             if op.__name__ in {"add", "radd", "mul", "rmul"}:

diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py
@@ -79,6 +79,9 @@ def isna(obj):
     >>> pd.isna('dog')
     False
 
+    >>> pd.isna(pd.NA)
+    True
+
     >>> pd.isna(np.nan)
     True
 
@@ -326,6 +329,9 @@ def notna(obj):
     >>> pd.notna('dog')
     True
 
+    >>> pd.notna(pd.NA)
+    False
+
     >>> pd.notna(np.nan)
     False
 
@@ -443,6 +449,9 @@ def array_equivalent(left, right, strict_nan: bool = False) -> bool:
             if left_value is NaT and right_value is not NaT:
                 return False
 
+            elif left_value is libmissing.NA and right_value is not libmissing.NA:
+                return False
+
             elif isinstance(left_value, float) and np.isnan(left_value):
                 if not isinstance(right_value, float) or not np.isnan(right_value):
                     return False
@@ -454,6 +463,8 @@ def array_equivalent(left, right, strict_nan: bool = False) -> bool:
                     if "Cannot compare tz-naive" in str(err):
                         # tzawareness compat failure, see GH#28507
                         return False
+                    elif "boolean value of NA is ambiguous" in str(err):
+                        return False
                     raise
         return True
 

diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py
@@ -35,6 +35,7 @@
 from pandas._config.config import get_option, set_option
 
 from pandas._libs import lib
+from pandas._libs.missing import NA
 from pandas._libs.tslib import format_array_from_datetime
 from pandas._libs.tslibs import NaT, Timedelta, Timestamp, iNaT
 from pandas._libs.tslibs.nattype import NaTType
@@ -1218,6 +1219,8 @@ def _format(x):
                     # determine na_rep if x is None or NaT-like
                     if x is None:
                         return "None"
+                    elif x is NA:
+                        return "NA"
                     elif x is NaT or np.isnat(x):
                         return "NaT"
                 except (TypeError, ValueError):

diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py
@@ -46,7 +46,7 @@ class TestPDApi(Base):
     deprecated_modules = []  # type: List[str]
 
     # misc
-    misc = ["IndexSlice", "NaT"]
+    misc = ["IndexSlice", "NaT", "NA"]
 
     # top-level classes
     classes = [
-Original file line number
+Diff line change
@@ Expand Up / @@ -68,6 +68,7 @@ @@
         DatetimeTZDtype,
         StringDtype,
         # missing
+        NA,
         isna,
         isnull,
         notna,
@@ Expand Down @@
Original file line number	Diff line number	Diff line change
Expand Up		@@ -56,3 +56,5 @@

		# DataFrame needs to be imported after NamedAgg to avoid a circular import
		from pandas.core.frame import DataFrame # isort:skip

		from pandas._libs.missing import NA