v1.16.1 #374

Merged 1 commit on Apr 9, 2024.
8 changes: 7 additions & 1 deletion pyproject.toml
@@ -81,7 +81,13 @@ profile = "black"
 
 [tool.pytest.ini_options]
 xfail_strict = true
-filterwarnings = ["error::DeprecationWarning", "error:FutureWarning"]
+filterwarnings = [
+    "error::DeprecationWarning",
+    "error:FutureWarning",
+    # FastArray converts all scalars into 1-dim arrays; big change required
+    "ignore:Conversion of an array with ndim > 0 to a scalar is deprecated:DeprecationWarning",
+    "ignore:np.find_common_type is deprecated.:DeprecationWarning",
+]
 
 [tool.pydocstyle]
 convention = "numpy"
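
For context, the new ignore entries target NumPy 1.25 deprecations. A minimal sketch (assuming NumPy >= 1.25) of the scalar-conversion warning being filtered:

```python
import warnings

import numpy as np

with warnings.catch_warnings():
    warnings.simplefilter("error", DeprecationWarning)
    try:
        float(np.array([1.0]))  # converting a ndim > 0 array to a scalar warns in NumPy 1.25+
    except DeprecationWarning as exc:
        print(exc)  # "Conversion of an array with ndim > 0 to a scalar is deprecated..."
```
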
6 changes: 3 additions & 3 deletions riptable/rt_accumtable.py
@@ -1346,8 +1346,8 @@ def accum_cols(cat, val_list, name_list=None, filt_list=None, func_list="nansum"
     val_list : array or list of arrays
         Array or list of arrays that ``func_list`` is applied to.
         :py:func:`~.rt_accumtable.accum_cols` returns an array for each element in
-        ``val_list``. If an element of ``val_list`` is itself a two-element list of two
-        arrays, :py:func:`~.rt_accumtable.accum_cols` calculates a ratio between the
+        ``val_list``. If an element of ``val_list`` is itself a two-element list or tuple of
+        two arrays, :py:func:`~.rt_accumtable.accum_cols` calculates a ratio between the
         values calculated by a reducing function for the two arrays.
         :py:func:`~.rt_accumtable.accum_ratio` performs this calculation using ``cat``,
         the two arrays, the respective filter, and the respective reducing function as
@@ -1628,7 +1628,7 @@ def accum_cols(cat, val_list, name_list=None, filt_list=None, func_list="nansum"
     for val, name, filt, func in zip(val_list, name_list, filt_list, func_list):
         func_name = func
         func = getattr(accum, func_name)
-        if isinstance(val, list):  # Special cases
+        if isinstance(val, (list, tuple)):  # Special cases
             if isinstance(val[1], str):  # Named cases
                 if val[1] in "pP":  # accum_ratiop type
                     curr_data = accum_ratiop(cat, temp_cat, val[0], filt, func_name, "T", False, False)
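
A usage sketch of the widened check (dataset and column names are hypothetical): a ratio pair may now be passed as a tuple as well as a list.

```python
import riptable as rt

ds = rt.Dataset({"Count": [4.0, -2.0, 4.0, -2.0], "PlusMinus": [1.0, 0.5, 1.0, 0.5]})
symb = rt.Categorical(["A", "B", "A", "B"])

# A two-element tuple now behaves like a two-element list: ratio of the two reduced columns.
accum = rt.accum_cols(symb, [(ds.Count, ds.PlusMinus)], ["Ratio"])
```
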
3 changes: 2 additions & 1 deletion riptable/rt_categorical.py
@@ -23,6 +23,7 @@
     List,
     Mapping,
     Optional,
+    Sequence,
     Tuple,
     Union,
 )
@@ -6355,7 +6356,7 @@ def hstack(cls, cats: Collection["Categorical"]) -> "Categorical":
     # ------------------------------------------------------------
     @classmethod
     def categories_equal(
-        cls, cats: List[Union["Categorical", np.ndarray, Tuple[np.ndarray, ...]]]
+        cls, cats: Sequence[Union["Categorical", np.ndarray, Tuple[np.ndarray, ...]]]
     ) -> Tuple[bool, List["Categorical"]]:
         """
         Check if every `Categorical` or array has the same categories (same unique values in the same order).
61 changes: 45 additions & 16 deletions riptable/rt_dataset.py
@@ -1469,7 +1469,7 @@ def single_array(col_idx, row_idx):
         return self._copy(deep=False, rows=row_idx, cols=col_idx)
 
     # ------------------------------------------------------------
-    def _dataset_compare_check(self, func_name, lhs) -> Self:
+    def _dataset_compare_check(self, func_name: str, lhs: Dataset, include_label_columns: bool = True) -> Self:
         # comparison function will be called by an array the size of the indexes, either
         # interperetted as integers, or as categorical strings
         # if compared to string, make sure the string matches the string type in categories
@@ -1481,18 +1481,28 @@ def _dataset_compare_check(self, func_name, lhs) -> Self:
             raise ValueError("The two Datasets have different lengths and cannot be compared")
         else:
             # returns a new dataset
+
+            label_cols = self.label_get_names()
+            lhs_label_cols = lhs.label_get_names()
             newds = {}
             # for all columns that match
             for colname in self.keys():
-                # if the lhs dataset has the same column name, compare
-                if hasattr(lhs, colname):
-                    # get the function reference for the comparison operator
-                    func = getattr(self[colname], func_name)
-                    # add the boolean array to the new dataset
-                    newds[colname] = func(lhs[colname])
-                else:
+                if not include_label_columns and colname in label_cols:
+                    continue
+
+                if not (hasattr(lhs, colname)) or not include_label_columns and colname in lhs_label_cols:
                     newds[colname] = np.array([False] * nrows)
+                    continue
+
+                # if the lhs dataset has the same column name, compare
+                # get the function reference for the comparison operator
+                func = getattr(self[colname], func_name)
+                # add the boolean array to the new dataset
+                newds[colname] = func(lhs[colname])
+
             for colname in lhs:
+                if not include_label_columns and colname in lhs_label_cols:
+                    continue
                 if colname not in newds:
                     newds[colname] = np.array([False] * nrows)
             return type(self)(newds)
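
The elementwise Dataset comparison operators route through `_dataset_compare_check`; a minimal sketch of the behavior it produces (values hypothetical):

```python
import riptable as rt

ds1 = rt.Dataset({"k": [1, 2], "v": [10, 20]})
ds2 = rt.Dataset({"k": [1, 2], "v": [10, 99]})

eq = ds1 == ds2  # a Dataset of boolean columns, one per compared column
print(eq.v)      # [True, False]
```
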
@@ -2000,7 +2010,7 @@ def imatrix_make(
     # -------------------------------------------------------
     # 2d arithmetic functions.
     def imatrix_y(
-        self, func: Union[Callable, str, List[Union[Callable, str]]], name: Optional[Union[str, List[str]]] = None
+        self, func: Union[Callable, str, Sequence[Union[Callable, str]]], name: Optional[Union[str, List[str]]] = None
     ) -> "Dataset":
         """
         Parameters
# -------------------------------------------------------
# 2d arithmetic functions.
def _imatrix_y_internal(
self, func, name: Optional[str] = None, showfilter: bool = True
self, func: Union[Callable, str], name: Optional[str] = None, showfilter: bool = True
) -> Optional[Tuple[Any, str, Callable]]:
"""
Parameters
Expand Down Expand Up @@ -4842,7 +4852,7 @@ def merge(
def merge2(
self,
right: "Dataset",
on: Optional[Union[str, Tuple[str, str], List[Union[str, Tuple[str, str]]]]] = None,
on: Optional[Union[str, Tuple[str, str], Sequence[Union[str, Tuple[str, str]]]]] = None,
left_on: Optional[Union[str, List[str]]] = None,
right_on: Optional[Union[str, List[str]]] = None,
how: str = "left",
@@ -4882,7 +4892,7 @@ def merge_asof(
         on: Optional[Union[str, Tuple[str, str]]] = None,
         left_on: Optional[str] = None,
         right_on: Optional[str] = None,
-        by: Optional[Union[str, Tuple[str, str], List[Union[str, Tuple[str, str]]]]] = None,
+        by: Optional[Union[str, Tuple[str, str], Sequence[Union[str, Tuple[str, str]]]]] = None,
         left_by: Optional[Union[str, List[str]]] = None,
         right_by: Optional[Union[str, List[str]]] = None,
         suffixes: Optional[Tuple[str, str]] = None,
@@ -4923,7 +4933,7 @@ def merge_asof(
     def merge_lookup(
         self,
         right: "Dataset",
-        on: Optional[Union[str, Tuple[str, str], List[Union[str, Tuple[str, str]]]]] = None,
+        on: Optional[Union[str, Tuple[str, str], Sequence[Union[str, Tuple[str, str]]]]] = None,
         left_on: Optional[Union[str, List[str]]] = None,
         right_on: Optional[Union[str, List[str]]] = None,
         require_match: bool = False,
@@ -8134,7 +8144,7 @@ def make_dataset(coldict, val, newds):
         return ms
 
     # -------------------------------------------------------
-    def equals(self, other, axis: Optional[int] = None, labels: bool = False, exact: bool = False):
+    def equals(self, other, axis: Optional[int] = None, labels: bool = True, exact: bool = False):
         """
         Test whether two Datasets contain the same elements in each column.
         NaNs in the same location are considered equal.
@@ -8246,8 +8256,27 @@ def ds_isnan(ds):
 
         else:
             try:
-                result = self.apply_cols(isnan, labels=labels) & other.apply_cols(isnan, labels=labels)
-                result |= self == other
+                compare_result = self._dataset_compare_check("__eq__", other, include_label_columns=labels)
+                if len(compare_result) == 0:
+                    # Empty result: could be either DS({}).equals(DS({})) or DS({'a': []}).equals({'b': []})
+                    cols = set(self.keys())
+                    other_cols = set(other.keys())
+
+                    if not labels:
+                        cols = cols.difference(self.label_get_names())
+                        other_cols = other_cols.difference(other.label_get_names())
+
+                    return cols == other_cols if axis != 1 else result
+
+                fill_false = lambda col: zeros(len(col), dtype=bool)
+                self_nans = self.apply_cols(isnan, fill_value=fill_false, labels=labels)
+                other_nans = other.apply_cols(isnan, fill_value=fill_false, labels=labels)
+                nan_result = (
+                    (self_nans & other_nans) if len(self.values()) > len(other.values()) else (other_nans & self_nans)
+                )
+                result = Dataset({})
+                for colname in compare_result:
+                    result[colname] = nan_result[colname] | compare_result[colname]
             except:
                 result = False
             if axis != 1:
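
With `labels` now defaulting to `True`, label (key) columns participate in `equals` unless explicitly excluded. A small sketch of the intended semantics (example data assumed, not from the PR):

```python
import numpy as np
import riptable as rt

ds1 = rt.Dataset({"a": [1.0, np.nan], "b": [3, 4]})
ds2 = rt.Dataset({"a": [1.0, np.nan], "b": [3, 4]})

assert ds1.equals(ds2)                # NaNs in matching positions compare equal
assert ds1.equals(ds2, labels=False)  # previous default: label columns excluded
```
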
20 changes: 20 additions & 0 deletions riptable/rt_fastarray.py
@@ -1325,6 +1325,19 @@ def __getitem__(self, fld) -> FastArray:
             if len(self.shape) == 1:
                 return TypeRegister.MathLedger._MBGET(self, fld)
 
+            # Workaround for indexing a 2d FastArray with invalid indices (eg produced by merge functions)
+            # At each invalid index, give a row of NaN's / Inv's
+            if isinstance(fld, FastArray) and self.ndim == 2 and fld.ndim == 1:
+                f_inv = fld.isna()
+                if f_inv.any():
+                    # Temporarily remove invalid indices and get item
+                    fld = fld.fillna(0)
+                    result = TypeRegister.MathLedger._GETITEM(super(FastArray, self), fld)
+                    # Add back in null row where invalid indices were
+                    null_row = np.ndarray.__getitem__(self, 0).copy_invalid()
+                    result.__setitem__(f_inv, null_row)
+                    return result.view(FastArray)
+
             result = TypeRegister.MathLedger._GETITEM(super(FastArray, self), fld)
             return result.view(FastArray)
         else:
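
A sketch of the workaround from a caller's perspective (assuming riptable's int64 invalid sentinel is int64 min; 2d FastArrays may warn on construction):

```python
import numpy as np
import riptable as rt

arr = rt.FastArray(np.arange(6.0).reshape(3, 2))

INV = np.iinfo(np.int64).min  # assumed invalid sentinel for int64
idx = rt.FastArray([0, 2, INV])

rows = arr[idx]  # the row at the invalid index comes back as NaNs instead of raising
```
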
@@ -5177,6 +5190,13 @@ def __array_ufunc__(self, ufunc: Callable, method: str, *inputs: Any, **kwargs:
         # replace out
         kwargs["out"] = tuple(out_args)
 
+        # NumPy 1.25 permits overriding "where" (https://numpy.org/doc/stable/release/1.25.0-notes.html#array-likes-that-define-array-ufunc-can-now-override-ufuncs-if-used-as-where)
+        # TODO: should we just unwrap all FA kwargs?
+        if "where" in kwargs:
+            kwarg_val = kwargs["where"]
+            if isinstance(kwarg_val, FastArray):
+                kwargs["where"] = kwarg_val.view(np.ndarray)
+
         # NOTE: If the specified ufunc + inputs combination isn't supported by numpy either,
         # as of numpy 1.17.x this call will end up raising a UFuncTypeError so the rest
         # of the FastArray.__array_ufunc__ body (below) won't end up executing.
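
The `where=` unwrap matters when a FastArray mask is passed to a ufunc; a hedged sketch:

```python
import numpy as np
import riptable as rt

fa = rt.FastArray([1.0, 2.0, 3.0])
mask = rt.FastArray([True, False, True])
out = rt.zeros(3)

# The FastArray mask is viewed as a plain ndarray before NumPy dispatches the ufunc.
np.add(fa, fa, out=out, where=mask)
```
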
2 changes: 1 addition & 1 deletion riptable/rt_hstack.py
@@ -582,7 +582,7 @@ def _possibly_add_concat_gap(coldim: int, inv_count, col, column_list, types):
     if inv_count[0] > 0:
         dtype = None
         if len(types) > 1:
-            common = np.find_common_type(types, [])
+            common = np.result_type(*types)
             # numeric, supported, ensure final vstack result dtype - strings will be automatic
             if common.num <= 13:
                 # future optimization: store this for performance, reduce types list to final type
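
`np.find_common_type` is deprecated as of NumPy 1.25; for the dtype-only case used here, `np.result_type` is the documented replacement:

```python
import numpy as np

types = [np.dtype(np.int32), np.dtype(np.float32)]
common = np.result_type(*types)  # float64, matching the old find_common_type(types, [])
```
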
11 changes: 6 additions & 5 deletions riptable/rt_merge.py
@@ -21,6 +21,7 @@
     List,
     NamedTuple,
     Optional,
+    Sequence,
     Set,
     Tuple,
     Union,
@@ -1645,7 +1646,7 @@ def _normalize_keep(
 
 
 def _extract_on_columns(
-    on: Optional[Union[str, Tuple[str, str], List[Union[str, Tuple[str, str]]]]],
+    on: Optional[Union[str, Tuple[str, str], Sequence[Union[str, Tuple[str, str]]]]],
     side_on: Optional[Union[str, List[str]]],
     for_left: bool,
     on_param_name: str,
@@ -2080,7 +2081,7 @@ def merge_indices(
     left: "Dataset",
     right: "Dataset",
     *,
-    on: Optional[Union[str, Tuple[str, str], List[Union[str, Tuple[str, str]]]]] = None,
+    on: Optional[Union[str, Tuple[str, str], Sequence[Union[str, Tuple[str, str]]]]] = None,
     how: str = "left",
     # TODO: Consider changing this to require_unique: Union[bool, Tuple[bool, bool]] -- the semantics would be clearer to users
     validate: Optional[str] = None,
@@ -2292,7 +2293,7 @@ def merge_indices(
 def merge2(
     left: "Dataset",
     right: "Dataset",
-    on: Optional[Union[str, Tuple[str, str], List[Union[str, Tuple[str, str]]]]] = None,
+    on: Optional[Union[str, Tuple[str, str], Sequence[Union[str, Tuple[str, str]]]]] = None,
     left_on: Optional[Union[str, List[str]]] = None,
     right_on: Optional[Union[str, List[str]]] = None,
     how: str = "left",
@@ -3228,7 +3229,7 @@ def require_columns_present(
 def merge_lookup(
     left: "Dataset",
     right: "Dataset",
-    on: Optional[Union[str, Tuple[str, str], List[Union[str, Tuple[str, str]]]]] = None,
+    on: Optional[Union[str, Tuple[str, str], Sequence[Union[str, Tuple[str, str]]]]] = None,
     left_on: Optional[Union[str, List[str]]] = None,
     right_on: Optional[Union[str, List[str]]] = None,
     require_match: bool = False,
@@ -3574,7 +3575,7 @@ def merge_asof(
     on: Optional[Union[str, Tuple[str, str]]] = None,
     left_on: Optional[str] = None,
     right_on: Optional[str] = None,
-    by: Optional[Union[str, Tuple[str, str], List[Union[str, Tuple[str, str]]]]] = None,
+    by: Optional[Union[str, Tuple[str, str], Sequence[Union[str, Tuple[str, str]]]]] = None,
     left_by: Optional[Union[str, List[str]]] = None,
     right_by: Optional[Union[str, List[str]]] = None,
     suffixes: Optional[Tuple[str, str]] = None,
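
The `List` to `Sequence` widening means a tuple of `(left_name, right_name)` pairs also satisfies the `on` annotation; a sketch with hypothetical columns:

```python
import riptable as rt

left = rt.Dataset({"key1": [1, 2], "key2": [5, 6], "x": [10, 20]})
right = rt.Dataset({"key1": [1, 2], "key2": [5, 6], "y": [0.1, 0.2]})

# A tuple of key pairs now type-checks where previously only a list did.
merged = rt.merge_lookup(left, right, on=(("key1", "key1"), ("key2", "key2")))
```
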
4 changes: 2 additions & 2 deletions riptable/rt_merge_asof.py
@@ -7,7 +7,7 @@
 from datetime import timedelta
 import logging
 from time import perf_counter_ns
-from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Sequence, Tuple, Union
 import warnings
 
 import numpy as np
@@ -1254,7 +1254,7 @@ def merge_asof2(
     on: Optional[Union[str, Tuple[str, str]]] = None,
     left_on: Optional[str] = None,
     right_on: Optional[str] = None,
-    by: Optional[Union[str, Tuple[str, str], List[Union[str, Tuple[str, str]]]]] = None,
+    by: Optional[Union[str, Tuple[str, str], Sequence[Union[str, Tuple[str, str]]]]] = None,
     left_by: Optional[Union[str, List[str]]] = None,
     right_by: Optional[Union[str, List[str]]] = None,
     suffixes: Optional[Tuple[str, str]] = None,
4 changes: 2 additions & 2 deletions riptable/rt_numpy.py
@@ -362,7 +362,7 @@ def get_common_dtype(x, y) -> np.dtype:
 
     # NOTE: find_common_type has a bug where int32 num 7 gets flipped to int32 num 5.
     if type1.num != type2.num:
-        common = np.find_common_type([type1, type2], [])
+        common = np.result_type(type1, type2)
     else:
         # for strings and unicode, pick the larger itemsize
         if type1.itemsize >= type2.itemsize:
@@ -433,7 +433,7 @@ def _find_lossless_common_type(dt1: np.dtype, dt2: np.dtype) -> Union[np.dtype,
     """
     Finds the lossless common type, or None if not found.
     """
-    dtc = np.find_common_type([dt1, dt2], [])
+    dtc = np.result_type(dt1, dt2)
 
     if not np.issubdtype(dt1, np.number) or not np.issubdtype(dt2, np.number):
         return dtc
4 changes: 3 additions & 1 deletion riptable/rt_sds.py
@@ -324,7 +324,9 @@ def sds_listdir(path: AnyPath) -> List[str]:
     return sds_os(os.listdir, path)
 
 
-def sds_endswith(path: Union[bytes, str, List[Union[bytes, str]]], add: bool = False) -> Union[bool, str, List[str]]:
+def sds_endswith(
+    path: Union[bytes, str, Sequence[Union[bytes, str]]], add: bool = False
+) -> Union[bool, str, List[str]]:
     """
     Returns true if the pathname ends with SDS extension, ``.sds``, unless `add` is enabled then it returns
     the SDS pathname.
12 changes: 9 additions & 3 deletions riptable/rt_utils.py
@@ -577,9 +577,15 @@ def _mbget_2dims(arr, idx):
     ncols = arr.shape[1]
     final_shape = (nrows, ncols)
 
-    # expand index array
-    # possible optimization: multiply on the smaller one first?
-    expanded_idx = np.repeat(idx, ncols) * ncols
+    # upcast to int64 so expanding does not go out of range of dtype
+    idx = idx.astype(np.int64)
+
+    # expand index array, safely keeping Invalids if given a FastArray
+    if isinstance(idx, TypeRegister.FastArray):
+        idx = np.where(idx.isna(), idx, idx * ncols)
+    else:
+        idx = idx * ncols
+    expanded_idx = np.repeat(idx, ncols)
     expanded_idx += tile(arange(ncols), nrows)
 
     # in as fortran
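
The int64 upcast guards against overflow when a narrow index dtype is multiplied by `ncols`; a small illustration:

```python
import numpy as np

idx = np.array([100], dtype=np.int8)
ncols = 2

print(idx * ncols)                   # [-56] -- int8 overflow wraps
print(idx.astype(np.int64) * ncols)  # [200] -- why _mbget_2dims upcasts first
```
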
2 changes: 1 addition & 1 deletion riptable/tests/test_accumtable.py
@@ -29,7 +29,7 @@ def test_accum_cols_ratio(self):
             }
         )
 
-        accum = rt.accum_cols(data.Symb, [[data.Count, data.PlusMinus]], ["Rezult"])
+        accum = rt.accum_cols(data.Symb, [(data.Count, data.PlusMinus)], ["Rezult"])
         accum_expected = rt.Dataset({"Symb": ["A", "B"], "Rezult": [8.0, -4.0]})
         accum_expected.footer_set_values("Total", {"Symb": "Total", "Rezult": -16.0})
         self.assertTrue((accum == accum_expected).all(axis=None))
2 changes: 1 addition & 1 deletion riptable/tests/test_categorical.py
@@ -2610,7 +2610,7 @@ def nb_vector(x):
         mcat = rt.Cat([c1, c2])
         res = mcat.numba_apply(nb_vector, np.arange(3000))
 
-        assert res.equals(mcat.apply(nb_vector, np.arange(3000))), "Failed check 4"
+        assert res.equals(mcat.apply(nb_vector, np.arange(3000)), labels=False), "Failed check 4"
 
         with pytest.raises(NotImplementedError):
             mcat.numba_apply(nb_sum, np.arange(3000), keyword="keyword")
1 change: 1 addition & 0 deletions riptable/tests/test_categorical_groupby.py
@@ -1,4 +1,5 @@
 from enum import IntEnum
+import warnings
 
 import numpy as np
 import pytest