Skip to content

Commit

Permalink
Progress: 2608 pass, 97 skip, 84 xfail, 6 xpass
Browse files Browse the repository at this point in the history
With these changes (and pandas-dev/pandas#53970 and hgrecco/pint#1615) the test suite passes or xpasses everything (no failures or errors).  Indeed, the uncertainties code has essentially doubled the scope of the test suite (to test with and without it).

The biggest gotcha is that the EA for complex numbers is not compatible with the EA for uncertainties, due to incompatible hacks:

The hack for complex numbers is to use np.nan (which is, technically, a complex number) for na_value across all numeric types.  But that doesn't work for uncertainties, because uncertainties doesn't accept np.nan as an uncertain value.

The hack for uncertainties is to use pd.NA for na_value.  This works for Int64, Float64, and uncertainties, but doesn't work for complex (which cannot tolerate NAType).

Some careful subclassing fills in what doesn't easily work, with fixtures to prevent the improper mixing of complex and uncertainty types in the same python environment.  Happy to discuss!

Signed-off-by: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com>
  • Loading branch information
MichaelTiemannOSC committed Jul 5, 2023
1 parent dbf5ad1 commit 3c6eff4
Show file tree
Hide file tree
Showing 3 changed files with 235 additions and 124 deletions.
198 changes: 145 additions & 53 deletions pint_pandas/pint_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def __new__(cls, units=None):
if not isinstance(units, _Unit):
units = cls._parse_dtype_strict(units)
# ureg.unit returns a quantity with a magnitude of 1
# eg 1 mm. Initialising a quantity and taking it's unit
# eg 1 mm. Initialising a quantity and taking its unit
# TODO: Separate units from quantities in pint
# to simplify this bit
units = cls.ureg.Quantity(1, units).units
Expand Down Expand Up @@ -148,7 +148,10 @@ def name(self):

@property
def na_value(self):
    """Return the missing-value sentinel as a Quantity in this dtype's units.

    When the ``uncertainties`` package is available we use ``pd.NA``,
    because ``ufloat`` values do not accept ``np.nan`` as a magnitude;
    otherwise we fall back to the plain-float sentinel ``np.nan``.
    """
    # NOTE: the unconditional pd.NA return that preceded this branch was a
    # leftover of the old implementation and made the fallback unreachable.
    if HAS_UNCERTAINTIES:
        return self.ureg.Quantity(pd.NA, self.units)
    else:
        return self.ureg.Quantity(np.nan, self.units)

def __hash__(self):
# make myself hashable
Expand Down Expand Up @@ -318,12 +321,41 @@ def __setitem__(self, key, value):
# doing nothing here seems to be ok
return

master_scalar = None
try:
master_scalar = next(i for i in self._data if pd.notna(i))
except StopIteration:
pass

if isinstance(value, _Quantity):
value = value.to(self.units).magnitude
elif is_list_like(value) and len(value) > 0 and isinstance(value[0], _Quantity):
value = [item.to(self.units).magnitude for item in value]
elif is_list_like(value) and len(value) > 0:
if isinstance(value[0], _Quantity):
value = [item.to(self.units).magnitude for item in value]
elif HAS_UNCERTAINTIES and isinstance(master_scalar, UFloat):
if not all([isinstance(i, UFloat) or pd.isna(i) for i in value]):
value = [
i if isinstance(i, UFloat) or pd.isna(i) else ufloat(i, 0)
for i in value
]
if len(value) == 1:
value = value[0]

key = check_array_indexer(self, key)
# Filter out invalid values for our array type(s)
if HAS_UNCERTAINTIES:
if isinstance(value, UFloat):
pass
elif is_list_like(value):
from pandas.core.dtypes.common import is_scalar

if is_scalar(key):
msg = "Value must be scalar. {}".format(value)
raise ValueError(msg)
elif type(value) is object:
if pd.notna(value):
msg = "Invalid object. {}".format(value)
raise ValueError(msg)
try:
self._data[key] = value
except IndexError as e:
Expand Down Expand Up @@ -535,45 +567,24 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
if dtype is None and isinstance(master_scalar, _Quantity):
dtype = PintType(master_scalar.units)

def quantify_nan(item, promote_to_ufloat):
if pd.isna(item):
return dtype.ureg.Quantity(item, dtype.units)
# FIXME: most of this code is never executed (except the final return)
if promote_to_ufloat:
if type(item) is UFloat:
return item * dtype.units
if type(item) is float:
if np.isnan(item):
return _ufloat_nan * dtype.units
else:
return UFloat(item, 0) * dtype.units
else:
if type(item) is float:
return item * dtype.units
return item

if isinstance(master_scalar, _Quantity):
# A quantified master_scalar does not guarantee that we don't have NA and/or np.nan values in our scalars
if HAS_UNCERTAINTIES:
promote_to_ufloat = any(
[isinstance(item.m, UFloat) for item in scalars if pd.notna(item)]
)
else:
promote_to_ufloat = False
scalars = [
item
if isinstance(item, _Quantity)
else quantify_nan(item, promote_to_ufloat)
for item in scalars
]
promote_to_ufloat = False
scalars = [
(item.to(dtype.units).magnitude if hasattr(item, "to") else item)
for item in scalars
]
elif HAS_UNCERTAINTIES:
promote_to_ufloat = any([isinstance(item, UFloat) for item in scalars])
# When creating empty arrays, make them large enough to hold UFloats in case we need to do so later
if len(scalars) == 0:
promote_to_ufloat = True
else:
promote_to_ufloat = any([isinstance(item, UFloat) for item in scalars])
else:
promote_to_ufloat = False
if len(scalars) == 0:
if promote_to_ufloat:
return cls([_ufloat_nan], dtype=dtype, copy=copy)[1:]
return cls(scalars, dtype=dtype, copy=copy)
if promote_to_ufloat:
scalars = [
item
Expand Down Expand Up @@ -639,6 +650,10 @@ def factorize(
# Complete control over factorization.
if HAS_UNCERTAINTIES and self._data.dtype.kind == "O":
arr, na_value = self._values_for_factorize()
# Unique elements make it easy to partition on na_value if we need to
arr_list = list(dict.fromkeys(arr))
na_index = len(arr_list)
arr = np.array(arr_list)

if not use_na_sentinel:
# factorize can now handle differentiating various types of null values.
Expand All @@ -649,36 +664,51 @@ def factorize(
if null_mask.any():
# Don't modify (potentially user-provided) array
arr = np.where(null_mask, na_value, arr)

codes = [-1] * len(self.data)
# Note that item is a local variable provided in the loop below
else:
try:
na_index = arr.tolist().index(na_value)
except ValueError:
# Keep as len(arr)
pass
codes = np.array([-1] * len(self.data), dtype=np.intp)
# Note: item is a local variable provided in the loop below
# Note: partitioning arr on pd.NA means item is never pd.NA
vf = np.vectorize(
lambda x: True
if (x_na := pd.isna(x)) * (item_na := pd.isna(item))
else (x_na == item_na and x == item),
lambda x: False if pd.isna(x) else x == item,
otypes=[bool],
)
for code, item in enumerate(arr):
for code, item in enumerate(arr[: na_index + 1]):
code_mask = vf(self._data)
# Don't count the NA we have seen
codes = np.where(code_mask, code, codes)

uniques_ea = self._from_factorized(arr, self)
if use_na_sentinel and na_index < len(arr):
for code, item in enumerate(arr[na_index:]):
code_mask = vf(self._data)
# Don't count the NA we have seen
codes = np.where(code_mask, code, codes)
uniques_ea = self._from_factorized(
arr[:na_index].tolist() + arr[na_index + 1 :].tolist(), self
)
else:
uniques_ea = self._from_factorized(arr, self)
return codes, uniques_ea
else:
return super(PintArray, self).factorize(self, use_na_sentinel)
return super(PintArray, self).factorize(use_na_sentinel)

@classmethod
def _from_factorized(cls, values, original):
    """Reconstruct an array from factorized values, reusing *original*'s dtype."""
    from pandas._libs.lib import infer_dtype

    # Object-inferred values (e.g. UFloat magnitudes) are wrapped directly;
    # anything else is routed through pd.array for a proper extension container.
    if infer_dtype(values) == "object":
        return cls(values, dtype=original.dtype)
    return cls(pd.array(values, copy=False), dtype=original.dtype)

def _values_for_factorize(self):
    """Return ``(values, na_value)`` consumed by factorize/unique.

    For object-dtyped storage (e.g. UFloat magnitudes), uncertain NaNs are
    canonicalized to this dtype's NA magnitude so that all missing entries
    compare equal during factorization.
    """
    # NOTE: the old HAS_UNCERTAINTIES manual-dedup branch that used to sit
    # here was removed; it shadowed the canonicalization below.
    arr = self._data
    if arr.dtype.kind == "O":
        if HAS_UNCERTAINTIES and arr.size > 0 and isinstance(arr[0], UFloat):
            # ufloat(nan, ...) values do not compare equal to each other;
            # replace them with the single canonical NA magnitude.
            arr = np.where(unp.isnan(arr), self.dtype.na_value.m, arr)
        return np.array(arr, copy=False), self.dtype.na_value.m
    # Masked/extension-backed storage knows how to factorize itself.
    return arr._values_for_factorize()

def value_counts(self, dropna=True):
Expand Down Expand Up @@ -706,7 +736,7 @@ def value_counts(self, dropna=True):
# compute counts on the data with no nans
data = self._data
nafilt = data.isna()
na_value = pd.NA
na_value = self.dtype.na_value.m
data = data[~nafilt]
if HAS_UNCERTAINTIES and data.dtype.kind == "O":
unique_data = []
Expand Down Expand Up @@ -746,6 +776,68 @@ def unique(self):
)
return self._from_sequence(unique(data), dtype=self.dtype)

def shift(self, periods: int = 1, fill_value=None):
    """
    Shift values by desired number.

    Newly introduced missing values are filled with a missing value type
    consistent with the existing elements, or ``self.dtype.na_value`` if
    none exist.

    Parameters
    ----------
    periods : int, default 1
        The number of periods to shift. Negative values are allowed
        for shifting backwards.
    fill_value : object, optional
        The scalar value to use for newly introduced missing values.
        The default is ``self.dtype.na_value``.

    Returns
    -------
    ExtensionArray
        Shifted.

    Notes
    -----
    If ``self`` is empty or ``periods`` is 0, a copy of ``self`` is
    returned.

    If ``periods > len(self)``, then an array of size len(self) is
    returned, with all values filled with ``self.dtype.na_value``.

    For 2-dimensional ExtensionArrays, we are always shifting along axis=0.
    """
    if not len(self) or periods == 0:
        return self.copy()

    if pd.isna(fill_value):
        # No explicit fill value: pick an NA consistent with the backing
        # storage (_ufloat_nan for object arrays of UFloats, np.nan for
        # plain floats, otherwise the dtype's default NA magnitude).
        # NOTE: this adjustment is deliberately limited to the default
        # case so a caller-supplied fill_value is never clobbered.
        fill_value = self.dtype.na_value.m
        if HAS_UNCERTAINTIES:
            if self.data.dtype.kind == "O":
                try:
                    notna_value = next(i for i in self._data if pd.notna(i))
                    if isinstance(notna_value, UFloat):
                        fill_value = _ufloat_nan
                except StopIteration:
                    pass
            elif self.data.dtype.kind == "f":
                fill_value = np.nan

    empty = self._from_sequence(
        [fill_value] * min(abs(periods), len(self)), dtype=self.dtype
    )
    if periods > 0:
        a = empty
        b = self[:-periods]
    else:
        a = self[abs(periods):]
        b = empty
    return self._concat_same_type([a, b])

def __contains__(self, item) -> bool:
if not isinstance(item, _Quantity):
return False
Expand Down Expand Up @@ -895,7 +987,7 @@ def __array__(self, dtype=None, copy=False):

def _to_array_of_quantity(self, copy=False):
qtys = [
self._Q(item, self._dtype.units) if item is not pd.NA else item
self._Q(item, self._dtype.units) if item is not self.dtype.na_value.m else item
for item in self._data
]
with warnings.catch_warnings(record=True):
Expand Down
11 changes: 4 additions & 7 deletions pint_pandas/testsuite/test_issues.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ class TestIssue21(BaseExtensionTests):
def test_offset_concat(self):
q_a = ureg.Quantity(np.arange(5) + ufloat(0, 0), ureg.Unit("degC"))
q_b = ureg.Quantity(np.arange(6) + ufloat(0, 0), ureg.Unit("degC"))
q_a_ = np.append(q_a, ureg.Quantity(ufloat(np.nan, 0), ureg.Unit("degC")))
q_a_ = np.append(q_a, ureg.Quantity(pd.NA, ureg.Unit("degC")))

a = pd.Series(PintArray(q_a))
b = pd.Series(PintArray(q_b))
Expand Down Expand Up @@ -179,13 +179,10 @@ def test_issue_127():
assert a == b


@pytest.mark.skipif(
not HAS_UNCERTAINTIES, reason="this test depends entirely on HAS_UNCERTAINTIES being True"
)
def test_issue_139():
from pint.compat import HAS_UNCERTAINTIES

assert HAS_UNCERTAINTIES
from uncertainties import ufloat
from uncertainties import unumpy as unp

q1 = 1.234
q2 = 5.678
q_nan = np.nan
Expand Down
Loading

0 comments on commit 3c6eff4

Please sign in to comment.