v1.16.1 #374

Merged 1 commit on Apr 9, 2024.
8 changes: 7 additions & 1 deletion pyproject.toml
@@ -81,7 +81,13 @@ profile = "black"
 
 [tool.pytest.ini_options]
 xfail_strict = true
-filterwarnings = ["error::DeprecationWarning", "error:FutureWarning"]
+filterwarnings = [
+    "error::DeprecationWarning",
+    "error:FutureWarning",
+    # FastArray converts all scalars into 1-dim arrays; big change required
+    "ignore:Conversion of an array with ndim > 0 to a scalar is deprecated:DeprecationWarning",
+    "ignore:np.find_common_type is deprecated.:DeprecationWarning",
+]
 
 [tool.pydocstyle]
 convention = "numpy"
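
For context, the new ignore entries target NumPy 1.25 deprecations. A minimal sketch (assuming NumPy >= 1.25) of the scalar-conversion warning being filtered:

```python
import warnings

import numpy as np

with warnings.catch_warnings():
    warnings.simplefilter("error", DeprecationWarning)
    try:
        float(np.array([1.0]))  # converting a ndim > 0 array to a scalar warns in NumPy 1.25+
    except DeprecationWarning as exc:
        print(exc)  # "Conversion of an array with ndim > 0 to a scalar is deprecated..."
```
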
6 changes: 3 additions & 3 deletions riptable/rt_accumtable.py
@@ -1346,8 +1346,8 @@ def accum_cols(cat, val_list, name_list=None, filt_list=None, func_list="nansum"
     val_list : array or list of arrays
         Array or list of arrays that ``func_list`` is applied to.
         :py:func:`~.rt_accumtable.accum_cols` returns an array for each element in
-        ``val_list``. If an element of ``val_list`` is itself a two-element list of two
-        arrays, :py:func:`~.rt_accumtable.accum_cols` calculates a ratio between the
+        ``val_list``. If an element of ``val_list`` is itself a two-element list or tuple of
+        two arrays, :py:func:`~.rt_accumtable.accum_cols` calculates a ratio between the
         values calculated by a reducing function for the two arrays.
         :py:func:`~.rt_accumtable.accum_ratio` performs this calculation using ``cat``,
         the two arrays, the respective filter, and the respective reducing function as
@@ -1628,7 +1628,7 @@ def accum_cols(cat, val_list, name_list=None, filt_list=None, func_list="nansum"
     for val, name, filt, func in zip(val_list, name_list, filt_list, func_list):
         func_name = func
         func = getattr(accum, func_name)
-        if isinstance(val, list):  # Special cases
+        if isinstance(val, (list, tuple)):  # Special cases
             if isinstance(val[1], str):  # Named cases
                 if val[1] in "pP":  # accum_ratiop type
                     curr_data = accum_ratiop(cat, temp_cat, val[0], filt, func_name, "T", False, False)
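
A usage sketch of the widened check (dataset and column names are hypothetical): a ratio pair may now be passed as a tuple as well as a list.

```python
import riptable as rt

ds = rt.Dataset({"Count": [4.0, -2.0, 4.0, -2.0], "PlusMinus": [1.0, 0.5, 1.0, 0.5]})
symb = rt.Categorical(["A", "B", "A", "B"])

# A two-element tuple now behaves like a two-element list: ratio of the two reduced columns.
accum = rt.accum_cols(symb, [(ds.Count, ds.PlusMinus)], ["Ratio"])
```
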
3 changes: 2 additions & 1 deletion riptable/rt_categorical.py
@@ -23,6 +23,7 @@
     List,
     Mapping,
     Optional,
+    Sequence,
     Tuple,
     Union,
 )
@@ -6355,7 +6356,7 @@ def hstack(cls, cats: Collection["Categorical"]) -> "Categorical":
     # ------------------------------------------------------------
     @classmethod
     def categories_equal(
-        cls, cats: List[Union["Categorical", np.ndarray, Tuple[np.ndarray, ...]]]
+        cls, cats: Sequence[Union["Categorical", np.ndarray, Tuple[np.ndarray, ...]]]
     ) -> Tuple[bool, List["Categorical"]]:
         """
         Check if every `Categorical` or array has the same categories (same unique values in the same order).
61 changes: 45 additions & 16 deletions riptable/rt_dataset.py
@@ -1469,7 +1469,7 @@ def single_array(col_idx, row_idx):
         return self._copy(deep=False, rows=row_idx, cols=col_idx)
 
     # ------------------------------------------------------------
-    def _dataset_compare_check(self, func_name, lhs) -> Self:
+    def _dataset_compare_check(self, func_name: str, lhs: Dataset, include_label_columns: bool = True) -> Self:
         # comparison function will be called by an array the size of the indexes, either
         # interperetted as integers, or as categorical strings
         # if compared to string, make sure the string matches the string type in categories
@@ -1481,18 +1481,28 @@ def _dataset_compare_check(self, func_name, lhs) -> Self:
             raise ValueError("The two Datasets have different lengths and cannot be compared")
         else:
             # returns a new dataset
+
+            label_cols = self.label_get_names()
+            lhs_label_cols = lhs.label_get_names()
             newds = {}
             # for all columns that match
             for colname in self.keys():
-                # if the lhs dataset has the same column name, compare
-                if hasattr(lhs, colname):
-                    # get the function reference for the comparison operator
-                    func = getattr(self[colname], func_name)
-                    # add the boolean array to the new dataset
-                    newds[colname] = func(lhs[colname])
-                else:
+                if not include_label_columns and colname in label_cols:
+                    continue
+
+                if not (hasattr(lhs, colname)) or not include_label_columns and colname in lhs_label_cols:
                     newds[colname] = np.array([False] * nrows)
+                    continue
+
+                # if the lhs dataset has the same column name, compare
+                # get the function reference for the comparison operator
+                func = getattr(self[colname], func_name)
+                # add the boolean array to the new dataset
+                newds[colname] = func(lhs[colname])
+
             for colname in lhs:
+                if not include_label_columns and colname in lhs_label_cols:
+                    continue
                 if colname not in newds:
                     newds[colname] = np.array([False] * nrows)
             return type(self)(newds)
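
The elementwise Dataset comparison operators route through `_dataset_compare_check`; a minimal sketch of the behavior it produces (values hypothetical):

```python
import riptable as rt

ds1 = rt.Dataset({"k": [1, 2], "v": [10, 20]})
ds2 = rt.Dataset({"k": [1, 2], "v": [10, 99]})

eq = ds1 == ds2  # a Dataset of boolean columns, one per compared column
print(eq.v)      # [True, False]
```
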
@@ -2000,7 +2010,7 @@ def imatrix_make(
     # -------------------------------------------------------
     # 2d arithmetic functions.
     def imatrix_y(
-        self, func: Union[Callable, str, List[Union[Callable, str]]], name: Optional[Union[str, List[str]]] = None
+        self, func: Union[Callable, str, Sequence[Union[Callable, str]]], name: Optional[Union[str, List[str]]] = None
     ) -> "Dataset":
         """
         Parameters
# -------------------------------------------------------
# 2d arithmetic functions.
def _imatrix_y_internal(
self, func, name: Optional[str] = None, showfilter: bool = True
self, func: Union[Callable, str], name: Optional[str] = None, showfilter: bool = True
) -> Optional[Tuple[Any, str, Callable]]:
"""
Parameters
Expand Down Expand Up @@ -4842,7 +4852,7 @@ def merge(
def merge2(
self,
right: "Dataset",
on: Optional[Union[str, Tuple[str, str], List[Union[str, Tuple[str, str]]]]] = None,
on: Optional[Union[str, Tuple[str, str], Sequence[Union[str, Tuple[str, str]]]]] = None,
left_on: Optional[Union[str, List[str]]] = None,
right_on: Optional[Union[str, List[str]]] = None,
how: str = "left",
@@ -4882,7 +4892,7 @@ def merge_asof(
         on: Optional[Union[str, Tuple[str, str]]] = None,
         left_on: Optional[str] = None,
         right_on: Optional[str] = None,
-        by: Optional[Union[str, Tuple[str, str], List[Union[str, Tuple[str, str]]]]] = None,
+        by: Optional[Union[str, Tuple[str, str], Sequence[Union[str, Tuple[str, str]]]]] = None,
         left_by: Optional[Union[str, List[str]]] = None,
         right_by: Optional[Union[str, List[str]]] = None,
         suffixes: Optional[Tuple[str, str]] = None,
@@ -4923,7 +4933,7 @@ def merge_asof(
     def merge_lookup(
         self,
         right: "Dataset",
-        on: Optional[Union[str, Tuple[str, str], List[Union[str, Tuple[str, str]]]]] = None,
+        on: Optional[Union[str, Tuple[str, str], Sequence[Union[str, Tuple[str, str]]]]] = None,
         left_on: Optional[Union[str, List[str]]] = None,
         right_on: Optional[Union[str, List[str]]] = None,
         require_match: bool = False,
@@ -8134,7 +8144,7 @@ def make_dataset(coldict, val, newds):
         return ms
 
     # -------------------------------------------------------
-    def equals(self, other, axis: Optional[int] = None, labels: bool = False, exact: bool = False):
+    def equals(self, other, axis: Optional[int] = None, labels: bool = True, exact: bool = False):
         """
         Test whether two Datasets contain the same elements in each column.
         NaNs in the same location are considered equal.
@@ -8246,8 +8256,27 @@ def ds_isnan(ds):
 
         else:
             try:
-                result = self.apply_cols(isnan, labels=labels) & other.apply_cols(isnan, labels=labels)
-                result |= self == other
+                compare_result = self._dataset_compare_check("__eq__", other, include_label_columns=labels)
+                if len(compare_result) == 0:
+                    # Empty result: could be either DS({}).equals(DS({})) or DS({'a': []}).equals({'b': []})
+                    cols = set(self.keys())
+                    other_cols = set(other.keys())
+
+                    if not labels:
+                        cols = cols.difference(self.label_get_names())
+                        other_cols = other_cols.difference(other.label_get_names())
+
+                    return cols == other_cols if axis != 1 else result
+
+                fill_false = lambda col: zeros(len(col), dtype=bool)
+                self_nans = self.apply_cols(isnan, fill_value=fill_false, labels=labels)
+                other_nans = other.apply_cols(isnan, fill_value=fill_false, labels=labels)
+                nan_result = (
+                    (self_nans & other_nans) if len(self.values()) > len(other.values()) else (other_nans & self_nans)
+                )
+                result = Dataset({})
+                for colname in compare_result:
+                    result[colname] = nan_result[colname] | compare_result[colname]
             except:
                 result = False
             if axis != 1:
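
With `labels` now defaulting to `True`, label (key) columns participate in `equals` unless explicitly excluded. A small sketch of the intended semantics (example data assumed, not from the PR):

```python
import numpy as np
import riptable as rt

ds1 = rt.Dataset({"a": [1.0, np.nan], "b": [3, 4]})
ds2 = rt.Dataset({"a": [1.0, np.nan], "b": [3, 4]})

assert ds1.equals(ds2)                # NaNs in matching positions compare equal
assert ds1.equals(ds2, labels=False)  # previous default: label columns excluded
```
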
20 changes: 20 additions & 0 deletions riptable/rt_fastarray.py
@@ -1325,6 +1325,19 @@ def __getitem__(self, fld) -> FastArray:
             if len(self.shape) == 1:
                 return TypeRegister.MathLedger._MBGET(self, fld)
 
+            # Workaround for indexing a 2d FastArray with invalid indices (eg produced by merge functions)
+            # At each invalid index, give a row of NaN's / Inv's
+            if isinstance(fld, FastArray) and self.ndim == 2 and fld.ndim == 1:
+                f_inv = fld.isna()
+                if f_inv.any():
+                    # Temporarily remove invalid indices and get item
+                    fld = fld.fillna(0)
+                    result = TypeRegister.MathLedger._GETITEM(super(FastArray, self), fld)
+                    # Add back in null row where invalid indices were
+                    null_row = np.ndarray.__getitem__(self, 0).copy_invalid()
+                    result.__setitem__(f_inv, null_row)
+                    return result.view(FastArray)
+
             result = TypeRegister.MathLedger._GETITEM(super(FastArray, self), fld)
             return result.view(FastArray)
         else:
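
A sketch of the workaround from a caller's perspective (assuming riptable's int64 invalid sentinel is int64 min; 2d FastArrays may warn on construction):

```python
import numpy as np
import riptable as rt

arr = rt.FastArray(np.arange(6.0).reshape(3, 2))

INV = np.iinfo(np.int64).min  # assumed invalid sentinel for int64
idx = rt.FastArray([0, 2, INV])

rows = arr[idx]  # the row at the invalid index comes back as NaNs instead of raising
```
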
@@ -5177,6 +5190,13 @@ def __array_ufunc__(self, ufunc: Callable, method: str, *inputs: Any, **kwargs:
         # replace out
         kwargs["out"] = tuple(out_args)
 
+        # NumPy 1.25 permits overriding "where" (https://numpy.org/doc/stable/release/1.25.0-notes.html#array-likes-that-define-array-ufunc-can-now-override-ufuncs-if-used-as-where)
+        # TODO: should we just unwrap all FA kwargs?
+        if "where" in kwargs:
+            kwarg_val = kwargs["where"]
+            if isinstance(kwarg_val, FastArray):
+                kwargs["where"] = kwarg_val.view(np.ndarray)
+
         # NOTE: If the specified ufunc + inputs combination isn't supported by numpy either,
         # as of numpy 1.17.x this call will end up raising a UFuncTypeError so the rest
         # of the FastArray.__array_ufunc__ body (below) won't end up executing.
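
The `where=` unwrap matters when a FastArray mask is passed to a ufunc; a hedged sketch:

```python
import numpy as np
import riptable as rt

fa = rt.FastArray([1.0, 2.0, 3.0])
mask = rt.FastArray([True, False, True])
out = rt.zeros(3)

# The FastArray mask is viewed as a plain ndarray before NumPy dispatches the ufunc.
np.add(fa, fa, out=out, where=mask)
```
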
2 changes: 1 addition & 1 deletion riptable/rt_hstack.py
@@ -582,7 +582,7 @@ def _possibly_add_concat_gap(coldim: int, inv_count, col, column_list, types):
     if inv_count[0] > 0:
         dtype = None
         if len(types) > 1:
-            common = np.find_common_type(types, [])
+            common = np.result_type(*types)
             # numeric, supported, ensure final vstack result dtype - strings will be automatic
             if common.num <= 13:
                 # future optimization: store this for performance, reduce types list to final type
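
`np.find_common_type` is deprecated as of NumPy 1.25; for the dtype-only case used here, `np.result_type` is the documented replacement:

```python
import numpy as np

types = [np.dtype(np.int32), np.dtype(np.float32)]
common = np.result_type(*types)  # float64, matching the old find_common_type(types, [])
```
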
11 changes: 6 additions & 5 deletions riptable/rt_merge.py
@@ -21,6 +21,7 @@
     List,
     NamedTuple,
     Optional,
+    Sequence,
     Set,
     Tuple,
     Union,
@@ -1645,7 +1646,7 @@ def _normalize_keep(
 
 
 def _extract_on_columns(
-    on: Optional[Union[str, Tuple[str, str], List[Union[str, Tuple[str, str]]]]],
+    on: Optional[Union[str, Tuple[str, str], Sequence[Union[str, Tuple[str, str]]]]],
     side_on: Optional[Union[str, List[str]]],
     for_left: bool,
     on_param_name: str,
@@ -2080,7 +2081,7 @@ def merge_indices(
     left: "Dataset",
     right: "Dataset",
     *,
-    on: Optional[Union[str, Tuple[str, str], List[Union[str, Tuple[str, str]]]]] = None,
+    on: Optional[Union[str, Tuple[str, str], Sequence[Union[str, Tuple[str, str]]]]] = None,
     how: str = "left",
     # TODO: Consider changing this to require_unique: Union[bool, Tuple[bool, bool]] -- the semantics would be clearer to users
     validate: Optional[str] = None,
@@ -2292,7 +2293,7 @@ def merge_indices(
 def merge2(
     left: "Dataset",
     right: "Dataset",
-    on: Optional[Union[str, Tuple[str, str], List[Union[str, Tuple[str, str]]]]] = None,
+    on: Optional[Union[str, Tuple[str, str], Sequence[Union[str, Tuple[str, str]]]]] = None,
     left_on: Optional[Union[str, List[str]]] = None,
     right_on: Optional[Union[str, List[str]]] = None,
     how: str = "left",
@@ -3228,7 +3229,7 @@ def require_columns_present(
 def merge_lookup(
     left: "Dataset",
     right: "Dataset",
-    on: Optional[Union[str, Tuple[str, str], List[Union[str, Tuple[str, str]]]]] = None,
+    on: Optional[Union[str, Tuple[str, str], Sequence[Union[str, Tuple[str, str]]]]] = None,
     left_on: Optional[Union[str, List[str]]] = None,
     right_on: Optional[Union[str, List[str]]] = None,
     require_match: bool = False,
@@ -3574,7 +3575,7 @@ def merge_asof(
     on: Optional[Union[str, Tuple[str, str]]] = None,
     left_on: Optional[str] = None,
     right_on: Optional[str] = None,
-    by: Optional[Union[str, Tuple[str, str], List[Union[str, Tuple[str, str]]]]] = None,
+    by: Optional[Union[str, Tuple[str, str], Sequence[Union[str, Tuple[str, str]]]]] = None,
     left_by: Optional[Union[str, List[str]]] = None,
     right_by: Optional[Union[str, List[str]]] = None,
     suffixes: Optional[Tuple[str, str]] = None,
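
The `List` to `Sequence` widening means a tuple of `(left_name, right_name)` pairs also satisfies the `on` annotation; a sketch with hypothetical columns:

```python
import riptable as rt

left = rt.Dataset({"key1": [1, 2], "key2": [5, 6], "x": [10, 20]})
right = rt.Dataset({"key1": [1, 2], "key2": [5, 6], "y": [0.1, 0.2]})

# A tuple of key pairs now type-checks where previously only a list did.
merged = rt.merge_lookup(left, right, on=(("key1", "key1"), ("key2", "key2")))
```
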
4 changes: 2 additions & 2 deletions riptable/rt_merge_asof.py
@@ -7,7 +7,7 @@
 from datetime import timedelta
 import logging
 from time import perf_counter_ns
-from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Sequence, Tuple, Union
 import warnings
 
 import numpy as np
@@ -1254,7 +1254,7 @@ def merge_asof2(
     on: Optional[Union[str, Tuple[str, str]]] = None,
     left_on: Optional[str] = None,
     right_on: Optional[str] = None,
-    by: Optional[Union[str, Tuple[str, str], List[Union[str, Tuple[str, str]]]]] = None,
+    by: Optional[Union[str, Tuple[str, str], Sequence[Union[str, Tuple[str, str]]]]] = None,
     left_by: Optional[Union[str, List[str]]] = None,
     right_by: Optional[Union[str, List[str]]] = None,
     suffixes: Optional[Tuple[str, str]] = None,
4 changes: 2 additions & 2 deletions riptable/rt_numpy.py
@@ -362,7 +362,7 @@ def get_common_dtype(x, y) -> np.dtype:
 
     # NOTE: find_common_type has a bug where int32 num 7 gets flipped to int32 num 5.
     if type1.num != type2.num:
-        common = np.find_common_type([type1, type2], [])
+        common = np.result_type(type1, type2)
     else:
         # for strings and unicode, pick the larger itemsize
         if type1.itemsize >= type2.itemsize:
@@ -433,7 +433,7 @@ def _find_lossless_common_type(dt1: np.dtype, dt2: np.dtype) -> Union[np.dtype,
     """
     Finds the lossless common type, or None if not found.
     """
-    dtc = np.find_common_type([dt1, dt2], [])
+    dtc = np.result_type(dt1, dt2)
 
     if not np.issubdtype(dt1, np.number) or not np.issubdtype(dt2, np.number):
         return dtc
4 changes: 3 additions & 1 deletion riptable/rt_sds.py
@@ -324,7 +324,9 @@ def sds_listdir(path: AnyPath) -> List[str]:
     return sds_os(os.listdir, path)
 
 
-def sds_endswith(path: Union[bytes, str, List[Union[bytes, str]]], add: bool = False) -> Union[bool, str, List[str]]:
+def sds_endswith(
+    path: Union[bytes, str, Sequence[Union[bytes, str]]], add: bool = False
+) -> Union[bool, str, List[str]]:
     """
     Returns true if the pathname ends with SDS extension, ``.sds``, unless `add` is enabled then it returns
     the SDS pathname.
12 changes: 9 additions & 3 deletions riptable/rt_utils.py
@@ -577,9 +577,15 @@ def _mbget_2dims(arr, idx):
     ncols = arr.shape[1]
     final_shape = (nrows, ncols)
 
-    # expand index array
-    # possible optimization: multiply on the smaller one first?
-    expanded_idx = np.repeat(idx, ncols) * ncols
+    # upcast to int64 so expanding does not go out of range of dtype
+    idx = idx.astype(np.int64)
+
+    # expand index array, safely keeping Invalids if given a FastArray
+    if isinstance(idx, TypeRegister.FastArray):
+        idx = np.where(idx.isna(), idx, idx * ncols)
+    else:
+        idx = idx * ncols
+    expanded_idx = np.repeat(idx, ncols)
     expanded_idx += tile(arange(ncols), nrows)
 
     # in as fortran
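
The int64 upcast guards against overflow when a narrow index dtype is multiplied by `ncols`; a small illustration:

```python
import numpy as np

idx = np.array([100], dtype=np.int8)
ncols = 2

print(idx * ncols)                   # [-56] -- int8 overflow wraps
print(idx.astype(np.int64) * ncols)  # [200] -- why _mbget_2dims upcasts first
```
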
2 changes: 1 addition & 1 deletion riptable/tests/test_accumtable.py
@@ -29,7 +29,7 @@ def test_accum_cols_ratio(self):
             }
         )
 
-        accum = rt.accum_cols(data.Symb, [[data.Count, data.PlusMinus]], ["Rezult"])
+        accum = rt.accum_cols(data.Symb, [(data.Count, data.PlusMinus)], ["Rezult"])
         accum_expected = rt.Dataset({"Symb": ["A", "B"], "Rezult": [8.0, -4.0]})
         accum_expected.footer_set_values("Total", {"Symb": "Total", "Rezult": -16.0})
         self.assertTrue((accum == accum_expected).all(axis=None))
2 changes: 1 addition & 1 deletion riptable/tests/test_categorical.py
@@ -2610,7 +2610,7 @@ def nb_vector(x):
         mcat = rt.Cat([c1, c2])
         res = mcat.numba_apply(nb_vector, np.arange(3000))
 
-        assert res.equals(mcat.apply(nb_vector, np.arange(3000))), "Failed check 4"
+        assert res.equals(mcat.apply(nb_vector, np.arange(3000)), labels=False), "Failed check 4"
 
         with pytest.raises(NotImplementedError):
             mcat.numba_apply(nb_sum, np.arange(3000), keyword="keyword")
1 change: 1 addition & 0 deletions riptable/tests/test_categorical_groupby.py
@@ -1,4 +1,5 @@
 from enum import IntEnum
+import warnings
 
 import numpy as np
 import pytest