From 438b085827e586de4ec8d9abd424ec7a6cd90f6f Mon Sep 17 00:00:00 2001 From: OrestZborowski-SIG <13220395+OrestZborowski-SIG@users.noreply.github.com> Date: Wed, 31 Jan 2024 10:44:51 -0500 Subject: [PATCH] v1.14.4 (#368) * v1.14.4-rc1 * v1.14.4-rc2 --------- Co-authored-by: rtosholdings-bot --- .github/workflows/python-package.yml | 4 +- conda_recipe/meta.yaml | 7 +- dev_tools/docstring_xfails.txt | 34 - dev_tools/gen_requirements.py | 3 +- dev_tools/validate_docstrings.py | 3 +- docs/source/conf.py | 8 + .../tutorial_cat_adv_instantiation.rst | 6 +- .../source/tutorial/tutorial_categoricals.rst | 6 +- docs/source/tutorial/tutorial_datetimes.rst | 18 +- docs/source/tutorial/tutorial_visualize.rst | 56 +- pyproject.toml | 58 +- riptable/Utils/display_options.py | 19 +- riptable/rt_accumtable.py | 14 +- riptable/rt_categorical.py | 617 ++++---- riptable/rt_dataset.py | 25 +- riptable/rt_datetime.py | 284 +--- riptable/rt_display.py | 30 +- riptable/rt_merge.py | 78 +- riptable/rt_numpy.py | 1258 ++++++++++------- riptable/rt_pdataset.py | 3 +- riptable/rt_sds.py | 14 +- riptable/rt_stats.py | 10 +- riptable/rt_struct.py | 33 +- riptable/rt_timezone.py | 6 +- riptable/rt_utils.py | 26 +- riptable/tests/test_base_function.py | 50 + riptable/tests/test_dataset.py | 14 + riptable/tests/test_saveload.py | 126 ++ riptable/tests/test_struct.py | 26 +- 29 files changed, 1600 insertions(+), 1236 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 013db43..92cb6b2 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -194,7 +194,7 @@ jobs: - name: Test riptable run: | set -ex - python -m riptable.tests.run + python -m pytest --pyargs riptable.tests # disable tooling integration tests until they work # ipython -m pytest riptable/test_tooling_integration # disable hypothesis tests until they run faster, are more consistent, and are easier to investigate @@ -261,7 +261,7 @@ jobs: 
conda list - name: Test with pytest run: | - python -m riptable.tests.run + python -m pytest --pyargs riptable.tests - name: Tooling integration tests run: | echo "DISABLED until tooling tests can be updated" diff --git a/conda_recipe/meta.yaml b/conda_recipe/meta.yaml index 85b6594..96b5b92 100644 --- a/conda_recipe/meta.yaml +++ b/conda_recipe/meta.yaml @@ -5,8 +5,7 @@ package: build: number: 0 noarch: python - # Use Python installed in host environment. - script: "{{ PYTHON }} -m pip install -v --no-deps --ignore-installed ." + script: "pip install -v --no-deps --no-build-isolation ." source: path: .. @@ -16,13 +15,13 @@ requirements: - python {{ python}} - setuptools_scm run: - - python - ansi2html >=1.5.2 - numpy >=1.23 - numba >=0.56.2 - pandas >=1.0,<3.0 + - python - python-dateutil - - riptide_cpp >=1.16.1,<2 # run with any (compatible) version in this range + - riptide_cpp >=1.16.3,<2 # run with any (compatible) version in this range about: home: https://github.com/rtosholdings/riptable diff --git a/dev_tools/docstring_xfails.txt b/dev_tools/docstring_xfails.txt index d3eeefe..bd15360 100644 --- a/dev_tools/docstring_xfails.txt +++ b/dev_tools/docstring_xfails.txt @@ -21,7 +21,6 @@ riptable.rt_bin.cut riptable.rt_bin.qcut riptable.rt_bin.quantile riptable.rt_categorical.CatZero -riptable.rt_categorical.Categorical riptable.rt_categorical.Categorical.align riptable.rt_categorical.Categorical.apply riptable.rt_categorical.Categorical.apply_nonreduce @@ -41,10 +40,7 @@ riptable.rt_categorical.Categorical.category_replace riptable.rt_categorical.Categorical.copy riptable.rt_categorical.Categorical.copy_invalid riptable.rt_categorical.Categorical.expand_any -riptable.rt_categorical.Categorical.expand_array riptable.rt_categorical.Categorical.expand_dict -riptable.rt_categorical.Categorical.fill_backward -riptable.rt_categorical.Categorical.fill_forward riptable.rt_categorical.Categorical.fill_invalid riptable.rt_categorical.Categorical.filtered_set_name 
riptable.rt_categorical.Categorical.filtered_string @@ -66,7 +62,6 @@ riptable.rt_categorical.Categorical.isfiltered riptable.rt_categorical.Categorical.isin riptable.rt_categorical.Categorical.ismultikey riptable.rt_categorical.Categorical.isna -riptable.rt_categorical.Categorical.isnan riptable.rt_categorical.Categorical.isnotnan riptable.rt_categorical.Categorical.issinglekey riptable.rt_categorical.Categorical.lock @@ -84,7 +79,6 @@ riptable.rt_categorical.Categorical.nunique riptable.rt_categorical.Categorical.one_hot_encode riptable.rt_categorical.Categorical.set_name riptable.rt_categorical.Categorical.set_valid -riptable.rt_categorical.Categorical.shift riptable.rt_categorical.Categorical.shift_cat riptable.rt_categorical.Categorical.shrink riptable.rt_categorical.Categorical.sort_gb @@ -644,7 +638,6 @@ riptable.rt_numpy.abs riptable.rt_numpy.absolute riptable.rt_numpy.all riptable.rt_numpy.any -riptable.rt_numpy.arange riptable.rt_numpy.argmax riptable.rt_numpy.argmin riptable.rt_numpy.argsort @@ -667,12 +660,9 @@ riptable.rt_numpy.cumprod riptable.rt_numpy.cumsum riptable.rt_numpy.diff riptable.rt_numpy.double -riptable.rt_numpy.empty -riptable.rt_numpy.empty_like riptable.rt_numpy.float32 riptable.rt_numpy.float64 riptable.rt_numpy.floor -riptable.rt_numpy.full riptable.rt_numpy.get_common_dtype riptable.rt_numpy.get_dtype riptable.rt_numpy.groupby @@ -688,14 +678,7 @@ riptable.rt_numpy.int64 riptable.rt_numpy.int8 riptable.rt_numpy.interp riptable.rt_numpy.interp_extrap -riptable.rt_numpy.isfinite -riptable.rt_numpy.isinf riptable.rt_numpy.ismember -riptable.rt_numpy.isnan -riptable.rt_numpy.isnanorzero -riptable.rt_numpy.isnotfinite -riptable.rt_numpy.isnotinf -riptable.rt_numpy.isnotnan riptable.rt_numpy.lexsort riptable.rt_numpy.log riptable.rt_numpy.log10 @@ -715,7 +698,6 @@ riptable.rt_numpy.mask_xor riptable.rt_numpy.mask_xori riptable.rt_numpy.max riptable.rt_numpy.maximum -riptable.rt_numpy.mean riptable.rt_numpy.median riptable.rt_numpy.min 
riptable.rt_numpy.min_scalar_type @@ -725,30 +707,20 @@ riptable.rt_numpy.nan_to_num riptable.rt_numpy.nanargmax riptable.rt_numpy.nanargmin riptable.rt_numpy.nanmax -riptable.rt_numpy.nanmean riptable.rt_numpy.nanmedian riptable.rt_numpy.nanmin riptable.rt_numpy.nanpercentile -riptable.rt_numpy.nanstd -riptable.rt_numpy.nansum -riptable.rt_numpy.nanvar -riptable.rt_numpy.ones -riptable.rt_numpy.ones_like riptable.rt_numpy.percentile riptable.rt_numpy.power riptable.rt_numpy.putmask riptable.rt_numpy.reindex_fast -riptable.rt_numpy.repeat riptable.rt_numpy.reshape riptable.rt_numpy.round riptable.rt_numpy.searchsorted riptable.rt_numpy.single riptable.rt_numpy.sort riptable.rt_numpy.sortinplaceindirect -riptable.rt_numpy.std riptable.rt_numpy.str_ -riptable.rt_numpy.sum -riptable.rt_numpy.tile riptable.rt_numpy.transpose riptable.rt_numpy.trunc riptable.rt_numpy.uint0 @@ -756,12 +728,7 @@ riptable.rt_numpy.uint16 riptable.rt_numpy.uint32 riptable.rt_numpy.uint64 riptable.rt_numpy.uint8 -riptable.rt_numpy.unique -riptable.rt_numpy.var riptable.rt_numpy.vstack -riptable.rt_numpy.where -riptable.rt_numpy.zeros -riptable.rt_numpy.zeros_like riptable.rt_pdataset.PDataset riptable.rt_pdataset.PDataset.hstack riptable.rt_pdataset.PDataset.igroupby @@ -864,7 +831,6 @@ riptable.rt_struct.Struct.hstack riptable.rt_struct.Struct.info riptable.rt_struct.Struct.is_valid_colname riptable.rt_struct.Struct.items -riptable.rt_struct.Struct.key_search riptable.rt_struct.Struct.keys riptable.rt_struct.Struct.label_as_dict riptable.rt_struct.Struct.label_filter diff --git a/dev_tools/gen_requirements.py b/dev_tools/gen_requirements.py index 3951d81..ef20edc 100644 --- a/dev_tools/gen_requirements.py +++ b/dev_tools/gen_requirements.py @@ -52,7 +52,8 @@ def is_python(major: int, minor: int) -> bool: ] + toolchain_reqs # PyPI setup build requirements. -# Most everything else will be specified in setup.py. 
+# Most everything *should* be in pyproject.toml, but some packages +# need to be set up manually here. pypi_reqs = [ "build", # PEP-517 py build frontend _BENCHMARK_REQ, # PyPI package doesn't exist diff --git a/dev_tools/validate_docstrings.py b/dev_tools/validate_docstrings.py index 3fbdfb5..85b7db7 100644 --- a/dev_tools/validate_docstrings.py +++ b/dev_tools/validate_docstrings.py @@ -51,7 +51,7 @@ Validator, validate, ) -import pandas + import riptable from riptable.Utils.common import cached_weakref_property @@ -98,7 +98,6 @@ IMPORT_CONTEXT = { "np": numpy, - "pd": pandas, "rt": riptable, } diff --git a/docs/source/conf.py b/docs/source/conf.py index 7f307ef..d15c7b6 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -87,8 +87,16 @@ def parse_filters(filterpath: str) -> typing.Tuple[typing.Optional[list[str]], t "sphinx_design", ] +# Put the list of valid time zones into docstrings using rst_prolog. +from riptable import TimeZone + +tz_list = 'Supported timezones: "' + '", "'.join(TimeZone.valid_timezones) + '"' +current_tzs_note = f"{tz_list}. To see supported timezones, use ``rt.TimeZone.valid_timezones``." + + rst_prolog = f""" .. |rtosholdings_docs| replace:: {os.getenv("RTOSHOLDINGS_DOCS", "rtosholdings-docs@sig.com")} +.. |To see supported timezones, use ``rt.TimeZone.valid_timezones``.| replace:: {current_tzs_note} """ diff --git a/docs/source/tutorial/tutorial_cat_adv_instantiation.rst b/docs/source/tutorial/tutorial_cat_adv_instantiation.rst index a8b1638..4144cdc 100644 --- a/docs/source/tutorial/tutorial_cat_adv_instantiation.rst +++ b/docs/source/tutorial/tutorial_cat_adv_instantiation.rst @@ -1,9 +1,9 @@ -A Useful Way to Instantiate a Categorical -***************************************** +Build a Categorical Iteratively +******************************* It can sometimes be useful to instantiate a Categorical with only one -category, then fill it in as needed. +category, then fill it in iteratively as needed. 
For example, let’s say we have a Dataset with a column that has a lot of categories, and we want to create a new Categorical column that keeps diff --git a/docs/source/tutorial/tutorial_categoricals.rst b/docs/source/tutorial/tutorial_categoricals.rst index 9bc1b5f..c7574e2 100644 --- a/docs/source/tutorial/tutorial_categoricals.rst +++ b/docs/source/tutorial/tutorial_categoricals.rst @@ -1172,7 +1172,11 @@ resulting Dataset is expanded. Note that until a reported bug is fixed, column names might not persist through grouping operations. -For more in-depth information about Categoricals, see the `Categoricals User Guide `. +For more in-depth information about Categoricals, see: + +- Appendix: :doc:`Build a Categorical Iteratively ` +- :doc:`Categoricals User Guide ` +- API Reference: :py:class:`~.rt_categorical.Categorical` class In the next section, `Accums `__, we look at another way to do multi-key groupings with fancier output. diff --git a/docs/source/tutorial/tutorial_datetimes.rst b/docs/source/tutorial/tutorial_datetimes.rst index c256f92..e411f34 100644 --- a/docs/source/tutorial/tutorial_datetimes.rst +++ b/docs/source/tutorial/tutorial_datetimes.rst @@ -149,19 +149,13 @@ strings. Strings are common when the data is from, say, a CSV file. Unlike ``Date`` objects, ``DateTimeNano``\ s are time-zone-aware. When you create a ``DateTimeNano``, you need to specify the time zone of -origin with the ``from_tz`` argument. Since Riptable is mainly used for -financial market data, its time zone options are limited to NYC, DUBLIN, -and (as of Riptable 1.3.6) Australia/Sydney, plus GMT and UTC (which is -an alias for GMT). - -(If you’re wondering why ‘Australia/Sydney’ isn’t abbreviated, it’s -because Riptable uses the standard time zone name from the `tz -database `__. In the future, -Riptable will support only the `standard -names `__ -in the tz database.) +origin with the ``from_tz`` argument. 
Riptable supports time zones in +certain areas of interest, such as New York, Dublin, Sydney, Tokyo, and +Hong Kong (more are added as needed). It also supports GMT and UTC. -:: +To see the list of valid time zones, use ``rt.TimeZone.valid_timezones``. + +Here, we create a ``DateTimeNano`` with 'GMT' as the time zone of origin:: >>> rt.DateTimeNano(['20210101 09:31:15', '20210519 05:21:17'], from_tz='GMT') DateTimeNano(['20210101 04:31:15.000000000', '20210519 01:21:17.000000000'], to_tz='NYC') diff --git a/docs/source/tutorial/tutorial_visualize.rst b/docs/source/tutorial/tutorial_visualize.rst index 0f770d0..894dc30 100644 --- a/docs/source/tutorial/tutorial_visualize.rst +++ b/docs/source/tutorial/tutorial_visualize.rst @@ -6,7 +6,7 @@ Matplotlib, to create visualizations of your data. You can also take advantage of the plotting and HTML styling tools offered by Pandas. In this section we’ll look at a couple of simple examples using -Matplotlib, Pandas, and Playa. +Matplotlib and Pandas. :: @@ -202,59 +202,9 @@ your Dataset for the rendering:: - -Groupscatter Plots with Playa ------------------------------ -Playa’s ``GroupScatter()`` method groups data into buckets based on -x-values and returns a Matplotlib plot summarizing the data. - -:: - - from playa.plot import GroupScatter - -Make a noisier price signal -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -:: - - >>> ds.NoisyPrice = ds.Price + rng.normal(0, 10, ds.shape[0]) - -A regular Matplotlib scatter plot, for comparison -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -:: - - >>> num_rows = int(rt.ceil(len(symbols)/2)) - >>> fig, axes = plt.subplots(num_rows, 2, figsize=(20, 5 * num_rows)) - >>> for (ax, symbol) in zip(axes.flatten(), symbols): - ... f = ds.Symbol==symbol - ... ax.scatter(ds.Time[f], ds.NoisyPrice[f]) - ... ax.grid() - ... ax.set_xlabel('Time') - ... ax.set_ylabel('Price') - ... ax.set_title(f'{symbol} Noisy Stock Price by Time') - >>> plt.show() - -.. 
image:: output_25_0.png - - -Now a GroupScatter for each one, you can see how it clarifies the point cloud and reveals the shape. -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -:: - - >>> fig, axes = plt.subplots(num_rows, 2, figsize=(20, 5 * num_rows)) - >>> for (ax, symbol) in zip(axes.flatten(), symbols): - ... f = ds.Symbol==symbol - ... gs = GroupScatter(ds.Time[f].hour, ds.NoisyPrice[f]) - ... gs.plot(title=f'{symbol} Noisy Stock Price Over Time', x_label='Hour of the Day', y_label='Price', ax=ax) - >>> plt.show() - -.. image:: output_27_0.png - -This was just a brief introduction – check out the Matpotlib, Pandas, -and Playa documentation for more details and possibilities. +This was a brief introduction – check out the Matpotlib and Pandas documentation for +more details and possibilities. Next we cover useful tools for working with ``NaN``\ s and other missing values: `Working with Missing Data `__. diff --git a/pyproject.toml b/pyproject.toml index ead3129..c357cce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,3 @@ -[build-system] -requires = ["setuptools>=65", "setuptools_scm[toml]>=7.1"] -build-backend = "setuptools.build_meta" - [project] name = "riptable" description = "Python Package for riptable studies framework" @@ -9,6 +5,15 @@ readme = "README.md" license = { file = "LICENSE" } authors = [{ name = "RTOS Holdings", email = "rtosholdings-bot@sig.com" }] requires-python = ">=3.9" +dynamic = ["version"] +dependencies = [ + "ansi2html >=1.5.2", + "numba >=0.56.2", + "numpy >=1.23", + "pandas >=1.0,<3.0", + "python-dateutil", + "riptide_cpp >=1.16.3,<2", +] classifiers = [ "Development Status :: 4 - Beta", "Programming Language :: Python :: 3", @@ -18,32 +23,30 @@ classifiers = [ "License :: OSI Approved :: BSD License", "Operating System :: OS Independent", ] -dependencies = [ - "ansi2html >=1.5.2", - "numba >=0.56.2", - "numpy >=1.23", - "pandas >=1.0,<3.0", - "python-dateutil", 
- "riptide_cpp >=1.16.1,<2", -] -dynamic = ["version"] [project.urls] Repository = "https://github.com/rtosholdings/riptable" Documentation = "https://riptable.readthedocs.io/en/stable/" +[build-system] +requires = [ # + "setuptools >=65", + "setuptools_scm[toml] >=7.1", +] +build-backend = "setuptools.build_meta" + +[tool.setuptools_scm] +version_scheme = "post-release" +local_scheme = "node-and-timestamp" +write_to = "riptable/_version.py" +write_to_template = "__version__ = '{version}'" + [tool.setuptools] include-package-data = true [tool.setuptools.packages.find] include = ["riptable*"] -[tool.setuptools_scm] -"version_scheme" = "post-release" -"local_scheme" = "node-and-timestamp" -"write_to" = "riptable/_version.py" -"write_to_template" = "__version__ = '{version}'" - # NOTE: you have to use single-quoted strings in TOML for regular expressions. # It's the equivalent of r-strings in Python. Multiline strings are treated as # verbose regular expressions by Black. Use [ ] to denote a significant space @@ -51,7 +54,9 @@ include = ["riptable*"] [tool.black] line-length = 120 -target-version = ["py38", "py39", "py310"] +skip-string-normalization = true +target-version = ["py39", "py310", "py311"] +include = '\.pyi?$' exclude = ''' /( \.cache @@ -117,6 +122,7 @@ exclude_lines = [ # ruff is a Python linter implemented in Rust: https://github.com/charliermarsh/ruff [tool.ruff] +target-version = "py310" line-length = 120 # Rule categories (or individual rules) to enable. @@ -147,9 +153,10 @@ ignore = [ # Temporarily-disabled rules. # Fix these issues and un-ignore these rules as soon as feasible. 
##### - "D101", # missing docstring in public class - "D102", # missing docstring in public method - # (add others here if/as needed) + "D101", # missing docstring in public class + "D102", # missing docstring in public method + "D202", # No blank lines allowed after function docstring + "D205", # 1 blank line required between summary line and description ##### # Rules we really want to disable (now and maybe forever). @@ -159,11 +166,14 @@ ignore = [ # instead of `np.logical_not()`, the pattern is detected by this rule # and converted to an "is"-based check that won't work with numpy/riptable arrays. "E712", # Comparison to `False` should be `cond is False` + # Q000 uses `flake8-quotes.inline-quotes` and seems to prefer double quote. But in our black + # configuration above we preserve single quotes (skip-string-normalization = true). + "Q000", # Single quotes found but double quotes preferred ] # Group output by file. This format is typically most useful for development; # override the format via the command-line when running `ruff` from within a CI job. -format = "grouped" +output-format = "grouped" [tool.ruff.flake8-quotes] docstring-quotes = "double" diff --git a/riptable/Utils/display_options.py b/riptable/Utils/display_options.py index c4bc503..f9fd850 100644 --- a/riptable/Utils/display_options.py +++ b/riptable/Utils/display_options.py @@ -1,7 +1,7 @@ """Display options for formatting and displaying numeric values, datasets, and multisets.""" import os from json import dump, load -from typing import Optional, Union +from typing import Optional, Sequence, Union from ..rt_enum import DisplayNumberSeparator, TypeRegister from .appdirs import user_config_dir @@ -231,6 +231,23 @@ class DisplayOptions(object): Prefix for column names to indicate that they are groupby keys (`str`, default "*"). """ + HTML_CUSTOM_TABLE_CSS: Optional[Union[str, Sequence[str]]] = None + """ + Custom CSS styles to apply to table elements (`str` or `list` of `str`, default `None`). 
+ + Examples + -------- + This example demonstrates how to style the table body headers and cells with a solid border. + + The optional ``!important`` forcibly overrides any Jupyter styling. + + >>> from riptable.Utils.display_options import DisplayOptions + >>> DisplayOptions.HTML_CUSTOM_TABLE_CSS = [ + "tbody thead, td {border-style: solid !important}", + ] + >>> rt.Dataset({"A": [0, 6, 9], "B": [1.2, 3.1, 9.6], "C": [-1.6, 2.7, 4.6], "D": [2.4, 6.2, 19.2]}) # doctest: +SKIP + """ + # TODO: split the json config loader to separate files so that new display formatting # can be added more easily for future data types diff --git a/riptable/rt_accumtable.py b/riptable/rt_accumtable.py index a42704a..512015e 100644 --- a/riptable/rt_accumtable.py +++ b/riptable/rt_accumtable.py @@ -988,13 +988,13 @@ def accum_ratio( val1 = cat2 cat2 = None if filt1 is None: - filt1 = full(val1.shape[0], True, dtype=bool) # This was playa.utils.truecol + filt1 = full(val1.shape[0], True, dtype=bool) if filt2 is None: filt2 = filt1 if func2 is None: func2 = func1 if cat2 is None: - cat2 = Categorical(full(val1.shape[0], 1, dtype=np.int8), ["NotGrouped"]) # This was playa.utils.onescol + cat2 = Categorical(full(val1.shape[0], 1, dtype=np.int8), ["NotGrouped"]) # Handle name collisions for key in ["Numer", "Denom", "Ratio"]: @@ -1265,7 +1265,7 @@ def accum_ratiop( """ # Handle missing inputs if val is None: - val = full(cat1.shape[0], 1, dtype=np.float64) # This was playa.utils.onescol + val = full(cat1.shape[0], 1, dtype=np.float64) if filter is None: if filt is not None: # Temporary until deprecated warnings.warn( @@ -1274,9 +1274,9 @@ def accum_ratiop( ) filter = filt else: - filter = full(val.shape[0], True, dtype=bool) # This was playa.utils.truecol + filter = full(val.shape[0], True, dtype=bool) if cat2 is None: - cat2 = Categorical(full(val.shape[0], 1, dtype=np.int8), ["NotGrouped"]) # This was playa.utils.onescol + cat2 = Categorical(full(val.shape[0], 1, dtype=np.int8), 
["NotGrouped"]) # Compute accum accum = AccumTable(cat1, cat2) @@ -1613,14 +1613,14 @@ def accum_cols(cat, val_list, name_list=None, filt_list=None, func_list="nansum" if filt_list is None: val_fst = val_list[0] shape = val_fst.shape[0] if isinstance(val_fst, np.ndarray) else val_fst[0].shape[0] - filt_list = full(shape, True, dtype=bool) # This was playa.utils.truecol + filt_list = full(shape, True, dtype=bool) if not isinstance(func_list, list): func_list = [func_list for _ in val_list] if not isinstance(filt_list, list): filt_list = [filt_list for _ in val_list] # Compute accum - temp_cat = Categorical(full(cat.shape[0], 1, dtype=np.int8), ["NotGrouped"]) # This was playa.utils.onescol + temp_cat = Categorical(full(cat.shape[0], 1, dtype=np.int8), ["NotGrouped"]) accum = Accum2(cat, temp_cat) for val, name, filt, func in zip(val_list, name_list, filt_list, func_list): diff --git a/riptable/rt_categorical.py b/riptable/rt_categorical.py index 59d29e4..a607449 100644 --- a/riptable/rt_categorical.py +++ b/riptable/rt_categorical.py @@ -1281,10 +1281,10 @@ def build_dicts_enum(cls, enum): # ------------------------------------------------------------ class Categorical(GroupByOps, FastArray): """ - A `Categorical` efficiently stores an array of repeated strings and is used for - groupby operations. + A :py:class:`~.rt_categorical.Categorical` efficiently stores an array of repeated + strings and is used for groupby operations. - Riptable `Categorical` objects have two related uses: + Riptable :py:class:`~.rt_categorical.Categorical` objects have two related uses: - They efficiently store string (or other large dtype) arrays that have repeated values. The repeated values are partitioned into groups (a.k.a. categories), @@ -1292,9 +1292,9 @@ class Categorical(GroupByOps, FastArray): stored and operated on more efficiently. - They're Riptable's class for doing groupby operations. A method applied to a - `Categorical` is applied to each group separately. 
+ :py:class:`~.rt_categorical.Categorical` is applied to each group separately. - A `Categorical` is typically created from a list of strings: + A :py:class:`~.rt_categorical.Categorical` is typically created from a list of strings: >>> c = rt.Categorical(["b", "a", "b", "a", "c", "c", "b"]) >>> c @@ -1304,17 +1304,18 @@ class Categorical(GroupByOps, FastArray): The output shows: - - The `Categorical` values. These are grouped into unique categories (here, "a", - "b", and "c"), which are also stored in the `Categorical` (see below). + - The :py:class:`~.rt_categorical.Categorical` values. These are grouped into unique + categories (here, "a", "b", and "c"), which are also stored in the + :py:class:`~.rt_categorical.Categorical` (see below). - The integer mapping codes (also called bins). Each integer is mapped to a unique category (here, 1 is mapped to "a", 2 is mapped to "b", and 3 is mapped to "c"). - Because these codes can also be used to index into the `Categorical`, - they're also referred to as indices. By default, the index is 1-based, with 0 - reserved for Filtered values. + Because these codes can also be used to index into the + :py:class:`~.rt_categorical.Categorical`, they're also referred to as indices. By + default, the index is 1-based, with ``0`` reserved for filtered values. - The unique categories. Each category represents a group for groupby operations. 
- Use `Categorical` objects to perform aggregations over arbitrary arrays of the same - dimension as the `Categorical`: + Use :py:class:`~.rt_categorical.Categorical` objects to perform aggregations over + arbitrary arrays of the same dimension as the :py:class:`~.rt_categorical.Categorical`: >>> c = rt.Categorical(["b", "a", "b", "a", "c", "c", "b"]) >>> ints = rt.FA([3, 10, 2, 5, 4, 1, 1]) @@ -1330,11 +1331,13 @@ class Categorical(GroupByOps, FastArray): **Multi-Key Categoricals** - The `Categorical` above is a single-key `Categorical` -- it groups one array of - values into keys (the categories) for groupby operations. + The :py:class:`~.rt_categorical.Categorical` above is a single-key + :py:class:`~.rt_categorical.Categorical` -- it groups one array of values into keys + (the categories) for groupby operations. - Multi-key `Categorical` objects let you create and operate on groupings based on - multiple associated categories. The associated keys form a group: + Multi-key :py:class:`~.rt_categorical.Categorical` objects let you create and + operate on groupings based on multiple associated categories. The associated keys + form a group: >>> strs = rt.FastArray(["a", "b", "b", "a", "b", "a"]) >>> ints = rt.FastArray([2, 1, 1, 2, 1, 1]) @@ -1350,22 +1353,24 @@ class Categorical(GroupByOps, FastArray): b 1 3 a 1 1 - [3 rows x 3 columns] total bytes: 27.0 B + [3 rows x 3 columns] total bytes: 39.0 B **Filtered Values and Categories** - Filter values and categories to exclude them from operations on the `Categorical`. + Filter values and categories to exclude them from operations on the + :py:class:`~.rt_categorical.Categorical`. - `Categorical` objects can be filtered when they're created or anytime afterwards. - Because filtered items are mapped to 0 in the integer mapping array, filters can be - used only in base-1 `Categorical` objects. + :py:class:`~.rt_categorical.Categorical` objects can be filtered when they're + created or anytime afterwards. 
Because filtered items are mapped to ``0`` in the + integer mapping array, filters can be used only in base-1 + :py:class:`~.rt_categorical.Categorical` objects. Filters can also be applied on a one-off basis at the time of an operation. See the Filtering topic under More About Categoricals for examples. **More About Categorials** - For more about using `Categorical` objects, see the + For more about using :py:class:`~.rt_categorical.Categorical` objects, see the :doc:`Categoricals ` section of the :doc:`/tutorial/tutorial` or these more in-depth topics: @@ -1382,23 +1387,25 @@ class Categorical(GroupByOps, FastArray): Parameters ---------- - values : array of str, int, or float, list of arrays, dict, or ~riptable.rt_categorical.Categorical or pandas.Categorical + values : array of str, int, or float, list of arrays, dict, or :py:class:`~.rt_categorical.Categorical` or :py:class:`pandas.Categorical` - Strings: Unicode strings and byte strings are supported. - Integers without provided categories: The integer mapping codes start at 1. - Integers with provided categories: If you have an array of integers that indexes into an array of provided unique categories, the integers are used - for the integer mapping array. Any 0 values are mapped to the Filtered category. + for the integer mapping array. Any ``0`` values are mapped to the ``Filtered`` category. - Floats are supported with no user-provided categories. If you have a Matlab - Categorical with categories, set ``from_matlab`` to `True`. `Categorical` objects - created from Matlab Categoricals must have a base-1 index; any 0.0 values - become Filtered. + Categorical with categories, set ``from_matlab`` to `True`. + :py:class:`~.rt_categorical.Categorical` objects created from Matlab Categoricals + must have a base-1 index; any ``0.0`` values become ``Filtered``. - A list of arrays or a dictionary with multiple key-value pairs creates a - multi-key `Categorical`. 
- - For a `Categorical` created from a `Categorical`, a deep copy of categories + multi-key :py:class:`~.rt_categorical.Categorical`. + - For a :py:class:`~.rt_categorical.Categorical` created from a + :py:class:`~.rt_categorical.Categorical`, a deep copy of categories is performed. - - For a `Categorical` created from a Pandas Categorical, a deep copy is - performed and indices start at 1 to preserve invalid values. - `Categorical` objects created from Pandas Catagoricals must have a base-1 index. + - For a :py:class:`~.rt_categorical.Categorical` created from a Pandas + Categorical, a deep copy is performed and indices start at 1 to preserve + invalid values. :py:class:`~.rt_categorical.Categorical` objects created from + Pandas Catagoricals must have a base-1 index. categories : array of str, int, or float, dict of {str : int} or {int : str}, or IntEnum, optional The unique categories. Can be: @@ -1410,8 +1417,9 @@ class Categorical(GroupByOps, FastArray): Note: - User-provided categories are always held in the order provided. - - Multi-key `Categorical` objects don't support user-provided categories. - ordered : bool, default None/True + - Multi-key :py:class:`~.rt_categorical.Categorical` objects don't support + user-provided categories. + ordered : bool, default `None`/`True` Controls whether categories are sorted lexicographically before they are mapped to integers: - If categories are not provided, by default they are sorted. If @@ -1419,108 +1427,120 @@ class Categorical(GroupByOps, FastArray): sort categories for groupby operations, use ``sort_gb=True`` (see below). - If categories are provided, they are always held in the order they're provided in; they can't be sorted with ``ordered`` or ``lex``. - sort_gb : bool, default None/False + sort_gb : bool, default `None`/`False` Controls whether groupby operation results are displayed in sorted order. Note that results may already appear sorted based on ``ordered`` or ``lex`` settings. 
sort_display : bool, optional See ``sort_gb``. - lex : bool, default None/False + lex : bool, default `None`/`False` Controls whether hashing- or sorting-based logic is used to find unique values in the input array. By default hashing is used. If more than 50% of the values are unique, set ``lex=True`` for a possibly faster lexicographical sort (not supported if categories are provided). - base_index : {None, 0, 1}, default None/1 + base_index : {None, 0, 1}, default `None`/1 By default, base-1 indexing is used. Base-0 can be used if: - - A mapping dictionary isn't used. A `Categorical` created from a mapping - dictionary does not have a base index. + - A mapping dictionary isn't used. A :py:class:`~.rt_categorical.Categorical` + created from a mapping dictionary does not have a base index. - A ``filter`` isn't used at creation. - - A Matlab or Pandas Categorical isn't being converted. These both reserve 0 + - A Matlab or Pandas Categorical isn't being converted. These both reserve ``0`` for invalid values. - If base-0 indexing is used, 0 becomes a valid category. + If base-0 indexing is used, ``0`` becomes a valid category. filter : array of bool, optional - Must be the same length as ``values``. Values that are `False` become Filtered - and mapped to 0 in the integer mapping array, and they are ignored in groupby - operations. A filter can't be used with a base-0 `Categorical` or one created - with a mapping dictionary or :py:class:`~enum.IntEnum`. + Must be the same length as ``values``. Values that are `False` become ``Filtered`` + and mapped to ``0`` in the integer mapping array, and they are ignored in groupby + operations. A filter can't be used with a base-0 + :py:class:`~.rt_categorical.Categorical` or one created with a mapping + dictionary or :py:class:`~enum.IntEnum`. dtype : riptable.dtype, numpy.dtype, or str, optional Force the dtype of the underlying integer mapping array. Must be a signed integer dtype. 
By default, the constructor uses the smallest dtype based on the number of unique categories or the maximum value provided in a mapping. - unicode : bool, default False + unicode : bool, default `False` By default, the array of unique categories is stored as byte strings. Set to `True` to store as unicode strings. invalid : str, optional Specify a value in ``values`` to be treated as an invalid category. Note: Invalid - categories are not excluded from aggregations; use `filter` instead. Warning: If the + categories are not excluded from aggregations; use ``filter`` instead. Warning: If the invalid category isn't included in ``categories`` and a ``filter`` is used, the - invalid category becomes Filtered. - auto_add : bool, default False + invalid category becomes ``Filtered``. + auto_add : bool, default `False` Warning: Until a known issue is fixed, adding categories can have unexpected results. Intended behavior: When set to `True`, categories that do not exist in the unique - categories can be added using `~Categorical.category_add`. - from_matlab : bool, default False + categories can be added using :py:meth:`~.rt_categorical.Categorical.category_add`. + from_matlab : bool, default `False` Set to `True` to convert a Matlab Categorical. The float indices are converted to an integer type. To preserve invalid values, only base-1 indexing is supported. + _from_categorical : bool, default `None` + Internal parameter. See Also -------- - ~riptable.Accum2 : Class for multi-key aggregations with summary data displayed. - Categorical._fa : + :py:class:`.rt_accum2.Accum2` : + Class for multi-key aggregations with summary data displayed. + :py:meth:`.rt_categorical.Categorical._fa` : Return the array of integer category mapping codes that corresponds to the - array of `Categorical` values. - Categorical.category_array : Return the array of unique categories of a `Categorical`. - Categorical.category_dict : Return a dictionary of the unique categories. 
- Categorical.category_mapping : - Return a dictionary of the integer category mapping codes for a `Categorical` - created with an :py:class:`~enum.IntEnum` or a mapping dictionary. - Categorical.base_index : See the base index of a `Categorical`. - Categorical.isnan : See which `Categorical` category is invalid. + array of :py:class:`~.rt_categorical.Categorical` values. + :py:meth:`.rt_categorical.Categorical.category_array` : + Return the array of unique categories of a :py:class:`~.rt_categorical.Categorical`. + :py:meth:`.rt_categorical.Categorical.category_dict` : + Return a dictionary of the unique categories. + :py:meth:`.rt_categorical.Categorical.category_mapping` : + Return a dictionary of the integer category mapping codes for a + :py:class:`~.rt_categorical.Categorical` created with an :py:class:`~enum.IntEnum` + or a mapping dictionary. + :py:meth:`.rt_categorical.Categorical.base_index` : + See the base index of a :py:class:`~.rt_categorical.Categorical`. + :py:meth:`.rt_categorical.Categorical.isnan` : + See which :py:class:`~.rt_categorical.Categorical` category is invalid. Examples -------- - A single-key `Categorical` created from a list of strings: + A single-key :py:class:`~.rt_categorical.Categorical` created from a list of strings: >>> c = rt.Categorical(["b", "a", "b", "a", "c", "c", "b"]) + >>> c Categorical([b, a, b, a, c, c, b]) Length: 7 FastArray([2, 1, 2, 1, 3, 3, 2], dtype=int8) Base Index: 1 FastArray([b'a', b'b', b'c'], dtype='|S1') Unique count: 3 - A `Categorical` created from list of non-unique string values and a list of unique - category strings. All values must appear in the provided categories, otherwise an - error is raised: + A :py:class:`~.rt_categorical.Categorical` created from list of non-unique string + values and a list of unique category strings. 
All values must appear in the provided + categories, otherwise an error is raised: >>> rt.Categorical(["b", "a", "b", "c", "a", "c", "c", "c"], categories=["b", "a", "c"]) Categorical([b, a, b, c, a, c, c, c]) Length: 8 FastArray([1, 2, 1, 3, 2, 3, 3, 3], dtype=int8) Base Index: 1 FastArray([b'b', b'a', b'c'], dtype='|S1') Unique count: 3 - A `Categorical` created from a list of integers that index into a list of unique - strings. The integers are used for the mapping array. Note that 0 becomes Filtered: + A :py:class:`~.rt_categorical.Categorical` created from a list of integers that + index into a list of unique strings. The integers are used for the mapping array. + Note that ``0`` becomes ``Filtered``: >>> rt.Categorical([0, 1, 1, 0, 2, 1, 2], categories=["c", "a", "b"]) Categorical([Filtered, c, c, Filtered, a, c, a]) Length: 7 FastArray([0, 1, 1, 0, 2, 1, 2]) Base Index: 1 FastArray([b'c', b'a', b'b'], dtype='|S1') Unique count: 3 - If integers are provided with no categories and 0 is included, the integer mapping - codes are incremented by 1 so that 0 is not Filtered: + If integers are provided with no categories and ``0`` is included, the integer mapping + codes are incremented by 1 so that ``0`` is not filtered: >>> rt.Categorical([0, 1, 1, 0, 2, 1, 2]) Categorical([0, 1, 1, 0, 2, 1, 2]) Length: 7 FastArray([1, 2, 2, 1, 3, 2, 3], dtype=int8) Base Index: 1 FastArray([0, 1, 2]) Unique count: 3 - Use ``from_matlab=True`` to create a `Categorical` from Matlab data. The float - indices are converted to an integer type. To preserve invalid values, only base-1 - indexing is supported: + Use ``from_matlab=True`` to create a :py:class:`~.rt_categorical.Categorical` from + Matlab data. The float indices are converted to an integer type. 
To preserve invalid + values, only base-1 indexing is supported: >>> rt.Categorical([0.0, 1.0, 2.0, 3.0, 1.0, 1.0], categories=["b", "c", "a"], from_matlab=True) Categorical([Filtered, b, c, a, b, b]) Length: 6 FastArray([0, 1, 2, 3, 1, 1], dtype=int8) Base Index: 1 FastArray([b'b', b'c', b'a'], dtype='|S1') Unique count: 3 - A `Categorical` created from a Pandas Categorical with an invalid value: + A :py:class:`~.rt_categorical.Categorical` created from a Pandas Categorical with an + invalid value: >>> import pandas as pd >>> pdc = pd.Categorical(["a", "a", "z", "b", "c"], ["c", "b", "a"]) @@ -1532,9 +1552,9 @@ class Categorical(GroupByOps, FastArray): FastArray([3, 3, 0, 2, 1], dtype=int8) Base Index: 1 FastArray([b'c', b'b', b'a'], dtype='|S1') Unique count: 3 - A `Categorical` created from a Python dictionary of strings to integers. The dictionary - is provided as the ``categories`` argument, with a list of the mapping codes provided - as the first argument: + A :py:class:`~.rt_categorical.Categorical` created from a Python dictionary of + strings to integers. 
The dictionary is provided as the ``categories`` argument, with + a list of the mapping codes provided as the first argument: >>> d = {"StronglyAgree": 44, "Agree": 133, "Disagree": 75, "StronglyDisagree": 1, "NeitherAgreeNorDisagree": 144 } >>> codes = [1, 44, 44, 133, 75] @@ -1543,7 +1563,8 @@ class Categorical(GroupByOps, FastArray): FastArray([ 1, 44, 44, 133, 75]) Base Index: None {44:'StronglyAgree', 133:'Agree', 75:'Disagree', 1:'StronglyDisagree', 144:'NeitherAgreeNorDisagree'} Unique count: 4 - A `Categorical` created using the categories of another `Categorical`: + A :py:class:`~.rt_categorical.Categorical` created using the categories of another + :py:class:`~.rt_categorical.Categorical`: >>> c = rt.Categorical(["a", "a", "b", "a", "c", "c", "b"], categories=["c", "b", "a"]) >>> c.category_array @@ -1554,8 +1575,8 @@ class Categorical(GroupByOps, FastArray): FastArray([2, 1, 1, 2], dtype=int8) Base Index: 1 FastArray([b'c', b'b', b'a'], dtype='|S1') Unique count: 3 - Multi-key Categoricals let you create and operate on groupings based on multiple - associated categories: + Multi-key :py:class:`~.rt_categorical.Categorical` objects let you create and + operate on groupings based on multiple associated categories: >>> strs = rt.FastArray(["a", "b", "b", "a", "b", "a"]) >>> ints = rt.FastArray([2, 1, 1, 2, 1, 3]) @@ -1571,7 +1592,7 @@ class Categorical(GroupByOps, FastArray): b 1 3 a 3 1 - [3 rows x 3 columns] total bytes: 27.0 B + [3 rows x 3 columns] total bytes: 39.0 B """ # current metadata version and default values necessary for final reconstruction @@ -2135,23 +2156,26 @@ def _nanfunc(self, func, fillval): # ------------------------------------------------------------ def isnan(self) -> FastArray: """ - Find the invalid elements of a `Categorical`. + Find the invalid elements of a :py:class:`~.rt_categorical.Categorical`. - An invalid category is specified when the `Categorical` is created or set - afterward using `Categorical.invalid_set`. 
An invalid category is different - from a Filtered category or a NaN value. + An invalid category is specified when the :py:class:`~.rt_categorical.Categorical` + is created or set afterward using :py:meth:`~.rt_categorical.Categorical.invalid_set`. + An invalid category is different from a filtered category or a NaN value. Returns ------- - FastArray + :py:class:`~.rt_fastarray.FastArray` A boolean array the length of the values array where `True` indicates - an invalid `Categorical` category. + an invalid :py:class:`~.rt_categorical.Categorical` category. See Also -------- - Categorical.isnotnan : Find the valid elements of a `Categorical.` - Categorical.invalid_category : The `Categorical` object's invalid category. - Categorical.invalid_set : Set a `Categorical` category to be invalid. + :py:meth:`.rt_categorical.Categorical.isnotnan` : + Find the valid elements of a :py:class:`~.rt_categorical.Categorical`. + :py:meth:`.rt_categorical.Categorical.invalid_category` : The + :py:class:`~.rt_categorical.Categorical` object's invalid category. + :py:meth:`.rt_categorical.Categorical.invalid_set` : + Set a :py:class:`~.rt_categorical.Categorical` category to be invalid. Examples -------- @@ -2163,7 +2187,7 @@ def isnan(self) -> FastArray: >>> c.isnan() FastArray([ True, False, False, True, False]) - Invalid categories are different from Filtered categories: + Invalid categories are different from filtered categories: >>> f = rt.FA([True, False, True, True, True]) >>> c2 = rt.Categorical(values=["b", "a", "c", "b", "c"], invalid="b", filter=f) @@ -2174,21 +2198,23 @@ def isnan(self) -> FastArray: >>> c2.isnan() # Only the invalid category returns True for Cat.isnan. FastArray([ True, False, False, True, False]) >>> c2.isfiltered() # Only the Filtered value returns True for Cat.isfiltered. 
-        FastArray([False,  True, False, False, False])
+        FastArray([False,  True, False, False, False])
 
-        Invalid categories in a `Categorical` are different from regular integer NaN
-        values. An integer NaN is a valid category and is `False` for ``Cat.isnan()``:
+        Invalid categories in a :py:class:`~.rt_categorical.Categorical` are different
+        from regular integer NaN values. An integer NaN is a valid category and is
+        `False` for :py:meth:`~.rt_categorical.Categorical.isnan`:
 
         >>> a = rt.FA([1, 2, 3, 4])
         >>> a[3] = a.inv  # Set the last value to an integer NaN.
         >>> a
-        FastArray([          1,           2,           3, -2147483648])
+        FastArray([                   1,                    2,
+                                      3, -9223372036854775808])
         >>> c3 = rt.Categorical(values=a, invalid=2)  # Make 2 an invalid category.
         >>> c3
-        Categorical([1, 2, 3, -2147483648]) Length: 4
+        Categorical([1, 2, 3, -9223372036854775808]) Length: 4
           FastArray([2, 3, 4, 1], dtype=int8) Base Index: 1
-          FastArray([-2147483648, 1, 2, 3]) Unique count: 4
-        >>> c3.invalid_category()
+          FastArray([-9223372036854775808, 1, 2, 3]) Unique count: 4
+        >>> c3.invalid_category
         2
         >>> c3.isnan()  # Only the invalid category returns True for Cat.isnan.
         FastArray([False,  True, False, False])
@@ -2200,11 +2226,11 @@ def isnan(self) -> FastArray:
     # ------------------------------------------------------------
     def isnotnan(self) -> FastArray:
         """
-        Find the valid elements of a `Categorical.`
+        Find the valid elements of a :py:class:`~.rt_categorical.Categorical`.
 
-        An invalid category is specified when the `Categorical` is created or set
-        afterward using `Categorical.invalid_set`. An invalid category is different
-        from a Filtered category or a NaN value.
+        An invalid category is specified when the :py:class:`~.rt_categorical.Categorical`
+        is created or set afterward using :py:meth:`~.rt_categorical.Categorical.invalid_set`.
+        An invalid category is different from a filtered category or a NaN value.
Returns ------- @@ -2228,7 +2254,7 @@ def isnotnan(self) -> FastArray: >>> c.isnotnan() FastArray([False, True, True, False, True]) - Invalid categories are different from Filtered categories: + Invalid categories are different from filtered categories: >>> f = rt.FA([True, False, True, True, True]) >>> c2 = rt.Categorical(values=["b", "a", "c", "b", "c"], invalid="b", filter=f) @@ -2291,35 +2317,33 @@ def fill_forward(self, *args, limit: int = 0, fill_val=None, inplace: bool = Fal limit : int, default 0 (disabled) The maximium number of consecutive NaN or invalid values to fill. If there is a gap with more than this number of consecutive NaN or invalid values, - the gap will be only partially filled. If no `limit` is specified, all - consecutive NaN and invalid values are replaced. - fill_val : scalar, default None + the gap is only partially filled. If no ``limit`` is specified or a value + of ``0`` is specified, all consecutive NaN and invalid values are replaced. + fill_val : scalar, default `None` The value to use where there is no valid group value to propagate forward. - If `fill_val` is not specified, NaN and invalid values aren't replaced where + If ``fill_val`` is not specified, NaN and invalid values aren't replaced where there is no valid group value to propagate forward. - inplace: bool, default False - If False, return a copy of the array. If True, modify original data. This - will modify any other views on this object. This fails if the array is + inplace : bool, default `False` + If `False`, return a copy of the array. If `True`, modify original data. This + modifies any other views on this object. This fails if the array is locked. Returns ------- - `Categorical` - The `Categorical` will be the same size and have the same dtypes as the - original input. + :py:class:`~.rt_categorical.Categorical` + The :py:class:`~.rt_categorical.Categorical` is the same size and has the + same dtypes as the original input. 
See Also -------- - Categorical.fill_backward : + :py:meth:`.rt_categorical.Categorical.fill_backward` : Replace NaN and invalid array values with the next valid group value. - GroupBy.fill_forward : - Replace NaN and invalid array values with the last valid group value. - riptable.fill_forward : Replace NaN and invalid values with the last valid - value. - Dataset.fillna : Replace NaN and invalid values with a specified value or - nearby data. - FastArray.fillna : Replace NaN and invalid values with a specified value or - nearby data. + :py:func:`.rt_fastarraynumba.fill_forward` : + Replace NaN and invalid values with the last valid value. + :py:meth:`.rt_dataset.Dataset.fillna` : + Replace NaN and invalid values with a specified value or nearby data. + :py:meth:`.rt_fastarray.FastArray.fillna` : + Replace NaN and invalid values with a specified value or nearby data. Examples -------- @@ -2334,8 +2358,10 @@ def fill_forward(self, *args, limit: int = 0, fill_val=None, inplace: bool = Fal B 3.00 A 2.00 B 3.00 + + [6 rows x 2 columns] total bytes: 56.0 B - Use a `fill_val` to replace values where there's no valid group value to + Use a ``fill_val`` to replace values where there's no valid group value to propagate forward: >>> x = rt.FastArray([rt.nan, rt.nan, 2, 3, 4, 5]) @@ -2368,35 +2394,33 @@ def fill_backward(self, *args, limit: int = 0, fill_val=None, inplace: bool = Fa limit : int, default 0 (disabled) The maximium number of consecutive NaN or invalid values to fill. If there is a gap with more than this number of consecutive NaN or invalid values, - the gap will be only partially filled. If no `limit` is specified, all - consecutive NaN and invalid values are replaced. - fill_val : scalar, default None + the gap is only partially filled. If no ``limit`` is specified or a value of + ``0`` is specified, all consecutive NaN and invalid values are replaced. 
+ fill_val : scalar, default `None` The value to use where there is no valid group value to propagate backward. - If `fill_val` is not specified, NaN and invalid values aren't replaced where + If ``fill_val`` is not specified, NaN and invalid values aren't replaced where there is no valid group value to propagate backward. - inplace: bool, default False - If False, return a copy of the array. If True, modify original data. This - will modify any other views on this object. This fails if the array is + inplace : bool, default `False` + If `False`, return a copy of the array. If `True`, modify original data. This + modifies any other views on this object. This fails if the array is locked. Returns ------- - `Categorical` - The `Categorical` will be the same size and have the same dtypes as the - original input. + :py:class:`~.rt_categorical.Categorical` + The :py:class:`~.rt_categorical.Categorical` is the same size and has the + same dtypes as the original input. See Also -------- - Categorical.fill_forward : + :py:meth:`.rt_categorical.Categorical.fill_forward` : Replace NaN and invalid array values with the last valid group value. - GroupBy.fill_backward : - Replace NaN and invalid array values with the next valid group value. - riptable.fill_backward : Replace NaN and invalid values with the next valid - value. - Dataset.fillna : Replace NaN and invalid values with a specified value or - nearby data. - FastArray.fillna : Replace NaN and invalid values with a specified value or - nearby data. + :py:func:`.rt_fastarraynumba.fill_backward` : Replace NaN and invalid values + with the next valid value. + :py:meth:`.rt_dataset.Dataset.fillna` : Replace NaN and invalid values with a + specified value or nearby data. + :py:meth:`.rt_fastarray.FastArray.fillna` : Replace NaN and invalid values with + a specified value or nearby data. 
Examples -------- @@ -2411,8 +2435,10 @@ def fill_backward(self, *args, limit: int = 0, fill_val=None, inplace: bool = Fa B 3.00 A 4.00 B 5.00 + + [6 rows x 2 columns] total bytes: 56.0 B - Use a `fill_val` to replace values where there's no valid group value to + Use a ``fill_val`` to replace values where there's no valid group value to propagate backward: >>> x = rt.FastArray([0, 1, 2, 3, rt.nan, rt.nan]) @@ -2467,25 +2493,34 @@ def set_name(self, name) -> Categorical: @property def _fa(self) -> FastArray: """ - Return the array of integer category mapping codes that corresponds to the array of `Categorical` values. + Return the array of integer category mapping codes that corresponds to the array + of :py:class:`~.rt_categorical.Categorical` values. Returns ------- - FastArray - A `.FastArray` of the integer category mapping codes of the `Categorical`. + :py:class:`~.rt_fastarray.FastArray` + A :py:class:`~.rt_fastarray.FastArray` of the integer category mapping codes + of the :py:class:`~.rt_categorical.Categorical`. See Also -------- - Categorical.category_array : Return the array of unique categories of a `Categorical`. - Categorical.categories : - Return the unique categories of a single-key or multi-key `Categorical`, prepended with the 'Filtered' category. - Categorical.category_dict : Return a dictionary of the unique categories. - Categorical.category_mapping : - Return a dictionary of the integer category mapping codes for a `Categorical` created with an :py:class:`~enum.IntEnum` or a mapping dictionary. + :py:meth:`.rt_categorical.Categorical.category_array` : + Return the array of unique categories of a + :py:class:`~.rt_categorical.Categorical`. + :py:meth:`.rt_categorical.Categorical.categories` : + Return the unique categories of a single-key or multi-key + :py:class:`~.rt_categorical.Categorical`, prepended with the 'Filtered' + category. + :py:meth:`.rt_categorical.Categorical.category_dict` : + Return a dictionary of the unique categories. 
+ :py:meth:`.rt_categorical.Categorical.category_mapping` : + Return a dictionary of the integer category mapping codes for a + :py:class:`~.rt_categorical.Categorical` created with an + :py:class:`~enum.IntEnum` or a mapping dictionary. Examples -------- - Single-key string `Categorical`: + Single-key string :py:class:`~.rt_categorical.Categorical`: >>> c = rt.Categorical(['a','a','b','c','a']) >>> c @@ -2495,7 +2530,7 @@ def _fa(self) -> FastArray: >>> c._fa FastArray([1, 1, 2, 3, 1], dtype=int8) - Multi-key `Categorical`: + Multi-key :py:class:`~.rt_categorical.Categorical`: >>> c2 = rt.Categorical([rt.FA([1, 2, 3, 3, 3, 1]), rt.FA(['a','b','c','c','c','a'])]) >>> c2 @@ -2505,7 +2540,9 @@ def _fa(self) -> FastArray: >>> c2._fa FastArray([1, 2, 3, 3, 3, 1], dtype=int8) - A `Categorical` constructed with an :py:class:`~enum.IntEnum` or a mapping dictionary returns the provided integer category mapping codes: + A :py:class:`~.rt_categorical.Categorical` constructed with an + :py:class:`~enum.IntEnum` or a mapping dictionary returns the provided integer + category mapping codes: >>> log_levels = {10: "DEBUG", 20: "INFO", 30: "WARNING", 40: "ERROR", 50: "CRITICAL"} >>> c3 = rt.Categorical([10, 10, 40, 0, 50, 10, 30], log_levels) @@ -2637,30 +2674,41 @@ def _categories(self): @property def category_array(self) -> FastArray: """ - Return the array of unique categories of a `Categorical`. + Return the array of unique categories of a + :py:class:`~.rt_categorical.Categorical`. - Unlike `Categorical.categories`, this method does not prepend the 'Filtered' category to the returned array. + Unlike :py:meth:`~.rt_categorical.Categorical.categories`, this method does not + prepend the 'Filtered' category to the returned array. - Raises an error for multi-key `Categorical` objects. To get the categories of a multi-key `Categorical`, use `Categorical.categories`. + Raises an error for multi-key :py:class:`~.rt_categorical.Categorical` objects. 
+ To get the categories of a multi-key :py:class:`~.rt_categorical.Categorical`, + use :py:meth:`~.rt_categorical.Categorical.categories`. Returns ------- - FastArray - A `.FastArray` of the unique categories of the `Categorical`. + :py:class:`~.rt_fastarray.FastArray` + A :py:class:`~.rt_fastarray.FastArray` of the unique categories of the + :py:class:`~.rt_categorical.Categorical`. See Also -------- - Categorical._fa : - Return the array of integer category mapping codes that corresponds to the array of `Categorical` values. - Categorical.categories : - Return the unique categories of a single-key or multi-key `Categorical`, prepended with the 'Filtered' category. - Categorical.category_dict : Return a dictionary of the unique categories. - Categorical.category_mapping : - Return a dictionary of the integer category mapping codes for a `Categorical` created with an :py:class:`~enum.IntEnum` or a mapping dictionary. + :py:meth:`.rt_categorical.Categorical._fa` : + Return the array of integer category mapping codes that corresponds to the + array of :py:class:`~.rt_categorical.Categorical` values. + :py:meth:`.rt_categorical.Categorical.categories` : + Return the unique categories of a single-key or multi-key + :py:class:`~.rt_categorical.Categorical`, prepended with the 'Filtered' + category. + :py:meth:`.rt_categorical.Categorical.category_dict` : + Return a dictionary of the unique categories. + :py:meth:`.rt_categorical.Categorical.category_mapping` : + Return a dictionary of the integer category mapping codes for a + :py:class:`~.rt_categorical.Categorical` created with an + :py:class:`~enum.IntEnum` or a mapping dictionary. 
Examples -------- - Single-key string `Categorical`: + Single-key string :py:class:`~.rt_categorical.Categorical`: >>> c = rt.Categorical(['a','a','b','c','a']) >>> c @@ -2670,7 +2718,7 @@ def category_array(self) -> FastArray: >>> c.category_array FastArray([b'a', b'b', b'c'], dtype='|S1') - Single-key integer `Categorical`: + Single-key integer :py:class:`~.rt_categorical.Categorical`: >>> c2 = rt.Categorical([4, 5, 4, 4, 6, 5, 6]) >>> c2 @@ -2680,7 +2728,8 @@ def category_array(self) -> FastArray: >>> c2.category_array FastArray([4, 5, 6]) - Single-key integer `Categorical` with categories provided: + Single-key integer :py:class:`~.rt_categorical.Categorical` with categories + provided: >>> c3 = rt.Categorical([2, 3, 4, 2, 3, 4], categories=['a', 'b', 'c', 'd', 'e']) >>> c3 @@ -2700,7 +2749,9 @@ def category_array(self) -> FastArray: >>> c4.category_array FastArray([b'a', b'b', b'c'], dtype='|S1') - A `Categorical` constructed with an :py:class:`~enum.IntEnum` or a mapping dictionary returns the provided string categories: + A :py:class:`~.rt_categorical.Categorical` constructed with an + :py:class:`~enum.IntEnum` or a mapping dictionary returns the provided string + categories: >>> log_levels = {10: "DEBUG", 20: "INFO", 30: "WARNING", 40: "ERROR", 50: "CRITICAL"} >>> c5 = rt.Categorical([10, 10, 40, 0, 50, 10, 30], log_levels) @@ -3310,93 +3361,97 @@ def shift( The number of periods to shift. Can be a negative number to shift values backward. periods : int, optional, default 1 - Can use `periods` instead of `window` for Pandas parameter + Can use ``periods`` instead of ``window`` for Pandas parameter support. - filter : FastArray of bool, optional - Set of rows to include. Filtered out rows are skipped by the shift and become NaN in the output. + filter : :py:class:`~.rt_fastarray.FastArray` of bool, optional + Set of rows to include. Filtered out rows are skipped by the shift and + become NaN in the output. 
Returns ------- - Dataset - A `.Dataset` containing a column of shifted values. + :py:class:`~.rt_dataset.Dataset` + A :py:class:`~.rt_dataset.Dataset` containing a column of shifted values. See Also -------- - Categorical.shift_cat : Shift the values of a `Categorical`. - .FastArray.shift : Shift the values of a `.FastArray`. - .DateTimeNano.shift : Shift the values of a `.DateTimeNano` array. + :py:meth:`.rt_categorical.Categorical.shift_cat` : + Shift the values of a :py:class:`~.rt_categorical.Categorical`. + :py:meth:`.rt_fastarray.FastArray.shift` : Shift the values of a + :py:class:`~.rt_fastarray.FastArray`. + :py:meth:`.rt_datetime.DateTimeNano.shift` : + Shift the values of a :py:class:`~.rt_datetime.DateTimeNano` array. Examples -------- - With the default `window=1`: + With the default ``window=1``: >>> c = rt.Cat(['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c']) >>> fa = rt.arange(9) >>> shift_val = c.shift(fa) >>> shift_val - # col_0 - - ----- - 0 Inv - 1 0 - 2 1 - 3 Inv - 4 3 - 5 4 - 6 Inv - 7 6 - 8 7 + # col_0 + --- ----- + 0 Inv + 1 0 + 2 1 + ... ... + 6 Inv + 7 6 + 8 7 + + [9 rows x 1 columns] total bytes: 72.0 B With ``window=2``: >>> shift_val_2 = c.shift(fa, window=2) >>> shift_val_2 - # col_0 - - ----- - 0 Inv - 1 Inv - 2 0 - 3 Inv - 4 Inv - 5 3 - 6 Inv - 7 Inv - 8 6 + # col_0 + --- ----- + 0 Inv + 1 Inv + 2 0 + ... ... + 6 Inv + 7 Inv + 8 6 + + [9 rows x 1 columns] total bytes: 72.0 B With ``window=-1``: >>> shift_neg = c.shift(fa, window=-1) >>> shift_neg - # col_0 - - ----- - 0 1 - 1 2 - 2 Inv - 3 4 - 4 5 - 5 Inv - 6 7 - 7 8 - 8 Inv - - With `filter`: + # col_0 + --- ----- + 0 1 + 1 2 + 2 Inv + ... ... 
+ 6 7 + 7 8 + 8 Inv + + [9 rows x 1 columns] total bytes: 72.0 B + + With ``filter``: >>> filt = rt.FA([True, True, True, True, False, True, False, True, True]) >>> shift_filt = c.shift(fa, filter=filt) >>> shift_filt - # col_0 - - ----- - 0 Inv - 1 0 - 2 1 - 3 Inv - 4 Inv - 5 3 - 6 Inv - 7 Inv - 8 7 - - Results put in a `.Dataset` to show the shifts in relation to the - categories: + # col_0 + --- ----- + 0 Inv + 1 0 + 2 1 + ... ... + 6 Inv + 7 Inv + 8 7 + + [9 rows x 1 columns] total bytes: 72.0 B + + Results put in a :py:class:`~.rt_dataset.Dataset` to show the shifts in relation + to the categories: >>> ds = rt.Dataset() >>> ds.c = c @@ -3404,34 +3459,34 @@ def shift( >>> ds.shift_val_2 = shift_val_2 >>> ds.shift_neg = shift_neg >>> ds - # c shift_val shift_val_2 shift_neg - - - --------- ----------- --------- - 0 a Inv Inv 1 - 1 a 0 Inv 2 - 2 a 1 0 Inv - 3 b Inv Inv 4 - 4 b 3 Inv 5 - 5 b 4 3 Inv - 6 c Inv Inv 7 - 7 c 6 Inv 8 - 8 c 7 6 Inv + # c shift_val shift_val_2 shift_neg + --- --- --------- ----------- --------- + 0 a Inv Inv 1 + 1 a 0 Inv 2 + 2 a 1 0 Inv + ... ... ... ... ... + 6 c Inv Inv 7 + 7 c 6 Inv 8 + 8 c 7 6 Inv + + [9 rows x 4 columns] total bytes: 228.0 B Shift two arrays: >>> fa2 = rt.arange(10, 19) >>> shift_val_3 = c.shift([fa, fa2]) >>> shift_val_3 - # col_0 col_1 - - ----- ----- - 0 Inv Inv - 1 0 10 - 2 1 11 - 3 Inv Inv - 4 3 13 - 5 4 14 - 6 Inv Inv - 7 6 16 - 8 7 17 + # col_0 col_1 + --- ----- ----- + 0 Inv Inv + 1 0 10 + 2 1 11 + ... ... ... + 6 Inv Inv + 7 6 16 + 8 7 17 + + [9 rows x 2 columns] total bytes: 144.0 B """ # support for pandas periods keyword # only one of window and period may be specified @@ -5095,14 +5150,14 @@ def count(self, filter: Optional[np.ndarray] = None, transform: bool = False) -> :py:class:`~.rt_categorical.Categorical` values that correspond to `False` filter values are excluded from the count. The filter array must be the same length as the :py:class:`~.rt_categorical.Categorical`. 
- transform : bool, default False + transform : bool, default `False` Set to `True` to return a :py:class:`~.rt_dataset.Dataset` that's the length of the :py:class:`~.rt_categorical.Categorical`, with counts aligned to the ungrouped :py:class:`~.rt_categorical.Categorical` values. Only the counts are included. Returns ------- - :py:class:`.rt_dataset.Dataset` + :py:class:`~.rt_dataset.Dataset` A :py:class:`~.rt_dataset.Dataset` containing each unique category and its count. If ``transform`` is `True`, the :py:class:`~.rt_dataset.Dataset` is the same length as the original :py:class:`~.rt_categorical.Categorical` and contains only the counts. @@ -5275,9 +5330,9 @@ def as_string_array(self) -> FastArray: a string array. For multi-key :py:class:`~.rt_categorical.Categorical` objects, the corresponding - keys are concatenated with a "_" separator. + keys are concatenated with a ``_`` separator. - Filtered values become the string "Filtered". Values from invalid + Filtered values become the string ``Filtered``. Values from invalid categories are treated the same way as values from valid categories. NOTE: This routine is costly because it re-expands the full list of @@ -5285,7 +5340,7 @@ def as_string_array(self) -> FastArray: Returns ------- - :py:class:`rt_fastarray.FastArray` + :py:class:`~.rt_fastarray.FastArray` A :py:class:`~.rt_fastarray.FastArray` of the string values of the :py:class:`~.rt_categorical.Categorical`. @@ -5943,51 +5998,55 @@ def expand_any(self, categories): @_use_autocomplete_placeholder(placeholder=lambda self: self._fa) def expand_array(self) -> Union[np.ndarray, Tuple[np.ndarray, ...]]: """ - Return the full list of values of a `Categorical`. + Return the full list of values of a :py:class:`~.rt_categorical.Categorical`. - If the `Categorical` is constructed with an :py:class:`~enum.IntEnum` or a mapping - dictionary, the integer mapping codes are returned. 
+ If the :py:class:`~.rt_categorical.Categorical` is constructed with an + :py:class:`~enum.IntEnum` or a mapping dictionary, the integer mapping codes are + returned. - Filtered `Categorical` values are returned as "Filtered" for string - arrays or numeric sentinel values for numeric arrays. + Filtered :py:class:`~.rt_categorical.Categorical` values are returned as + ``Filtered`` for string arrays or numeric sentinel values for numeric arrays. Note that because the expansion constructs the complete list of values from the list of unique categories, it is an expensive operation. Returns ------- - FastArray or tuple of FastArray - For single-key `Categorical` objects, a `FastArray` is returned. For - multi-key `Categorical` objects, a tuple of `FastArray` objects is - returned. + :py:class:`~.rt_fastarray.FastArray` or tuple of :py:class:`~.rt_fastarray.FastArray` + For single-key :py:class:`~.rt_categorical.Categorical` objects, a + :py:class:`~.rt_fastarray.FastArray` is returned. For multi-key + :py:class:`~.rt_categorical.Categorical` objects, a tuple of + :py:class:`~.rt_fastarray.FastArray` objects is returned. Warns ----- Performance warning - Will warn the user if a large `Categorical` (more than 100,000 items) - is being re-expanded. + Warns the user if a large :py:class:`~.rt_categorical.Categorical` (more + than 100,000 items) is being re-expanded. See Also -------- - Categorical.as_string_array : - Return the full list of values of a `Categorical` as a string array. + :py:meth:`.rt_categorical.Categorical.as_string_array` : + Return the full list of values of a :py:class:`~.rt_categorical.Categorical` + as a string array. 
Examples -------- - Single-key `Categorical`: + Single-key :py:class:`~.rt_categorical.Categorical`: >>> c = rt.Categorical(["a", "a", "b", "c", "a"]) >>> c.expand_array - FastArray([b'a', b'a', b'b', b'c', b'a'], dtype='|S3') + FastArray([b'a', b'a', b'b', b'c', b'a'], dtype='|S8') - Multi-key `Categorical`: + Multi-key :py:class:`~.rt_categorical.Categorical`: >>> c = rt.Categorical([rt.FastArray(["a", "b", "c", "a"]), rt.FastArray([1, 2, 3, 1])]) >>> c.expand_array (FastArray([b'a', b'b', b'c', b'a'], dtype='|S8'), FastArray([1, 2, 3, 1])) - For a `Categorical` constructed with an :py:class:`~enum.IntEnum` or a mapping dictionary, - the array of integer mapping codes (``c._fa``) is returned: + For a :py:class:`~.rt_categorical.Categorical` constructed with an + :py:class:`~enum.IntEnum` or a mapping dictionary, the array of integer + mapping codes (``c._fa``) is returned: >>> c = rt.Categorical([2, 2, 2, 1, 3], {"a": 1, "b": 2, "c": 3}) >>> c @@ -5999,7 +6058,8 @@ def expand_array(self) -> Union[np.ndarray, Tuple[np.ndarray, ...]]: >>> c._fa FastArray([2, 2, 2, 1, 3]) - Filtered string `Categorical` values are returned as the string "Filtered": + Filtered string :py:class:`~.rt_categorical.Categorical` values are returned as + the string ``Filtered``: >>> a = rt.FastArray(["a", "c", "b", "b", "c", "a"]) >>> f = rt.FastArray([False, False, True, True, True, True]) @@ -6011,8 +6071,8 @@ def expand_array(self) -> Union[np.ndarray, Tuple[np.ndarray, ...]]: >>> c.expand_array FastArray([b'Filtered', b'Filtered', b'b', b'b', b'c', b'a'], dtype='|S8') - Filtered integer `Categorical` values are returned as the integer - sentinel value: + Filtered integer :py:class:`~.rt_categorical.Categorical` values are returned as + the integer sentinel value: >>> a = rt.FastArray([1, 3, 2, 2, 3, 1]) >>> f = rt.FastArray([False, False, True, True, True, True]) @@ -6022,8 +6082,9 @@ def expand_array(self) -> Union[np.ndarray, Tuple[np.ndarray, ...]]: FastArray([0, 0, 2, 2, 3, 1], 
dtype=int8) Base Index: 1 FastArray([1, 2, 3]) Unique count: 3 >>> c.expand_array - FastArray([-2147483648, -2147483648, 2, 2, - 3, 1]) + FastArray([-9223372036854775808, -9223372036854775808, + 2, 2, + 3, 1]) """ if len(self) > 100_000: warnings.warn(f"Performance warning: re-expanding categorical of {len(self)} items.") diff --git a/riptable/rt_dataset.py b/riptable/rt_dataset.py index 2c0dbac..377d09f 100644 --- a/riptable/rt_dataset.py +++ b/riptable/rt_dataset.py @@ -19,6 +19,7 @@ Mapping, Optional, Sequence, + Sized, Tuple, Union, Literal, @@ -543,6 +544,8 @@ def _ensure_vector(self, vec): # ------------------------------------------------------------ def _check_addtype(self, name, value): # TODO use _possibly_convert -- why are these two routines different? + + # handle special case of Dataset value. if isinstance(value, Dataset): # if they try to add a dataset to a single column # then if the dataset has one column, use that @@ -559,19 +562,37 @@ def _check_addtype(self, name, value): raise TypeError(f"Cannot determine which column of Dataset to add to the Dataset column {name!r}.") return self._check_addtype(name, value) + # if not an ndarray, convert value into one. if not isinstance(value, np.ndarray): if isinstance(value, set): raise TypeError(f"Cannot create Dataset column {name!r} out of tuples or sets {value!r}.") - value = np.asanyarray(value) + # extract the length of the value, if it's an array-like + rowlen: Optional[int] = len(value) if isinstance(value, Sized) else None + + # if empty dataset, set compatible length from value if self._nrows is None: - if value.ndim > 0: + if rowlen is not None: self._nrows = len(value) else: # how to get here: # ds=Dataset() # ds[['g','c']]=3 self._nrows = 1 + + # if scalar, but with repeat method, broadcast into new array. 
+ if rowlen is None and hasattr(value, "repeat"): + value = value.repeat(self._nrows) + + # else if single-element, then treat same as scalar broadcast case + elif rowlen == 1 and hasattr(value[0], "repeat"): + value = value[0].repeat(self._nrows) + + # otherwise convert to ndarray (downcasts any riptable types) + else: + value = np.asanyarray(value) + + # scalars get turned into 0-dim ndarrays, so broadcast into new array if value.ndim == 0: value = full(self._nrows, value) diff --git a/riptable/rt_datetime.py b/riptable/rt_datetime.py index 674c580..ebcc2e2 100644 --- a/riptable/rt_datetime.py +++ b/riptable/rt_datetime.py @@ -275,6 +275,8 @@ def strptime_to_nano(dtstrings, format, from_tz=None, to_tz="NYC"): """ Converts datetime string to DateTimeNano object with user-specified format. + |To see supported timezones, use ``rt.TimeZone.valid_timezones``.| + Parameters ---------- dtstrings : array of timestrings @@ -300,7 +302,7 @@ def strptime_to_nano(dtstrings, format, from_tz=None, to_tz="NYC"): * ``%S`` Second as a decimal number (with or without zero-padding). from_tz : str - The timezone of origin: 'NYC', 'GMT', 'DUBLIN', etc. + The timezone of origin. to_tz : str The timezone that the time will be displayed in. @@ -378,12 +380,15 @@ def _possibly_convert_cat(arr): def datetimestring_to_nano(dtstring, from_tz=None, to_tz="NYC"): """ Converts datetime string to DateTimeNano object. + By default, the timestrings are assumed to be in Eastern Time. If they are already in UTC time, set gmt=True. + |To see supported timezones, use ``rt.TimeZone.valid_timezones``.| + Parameters ---------- dtstring : array of timestrings in format YYYY-MM-DD HH:MM:SS, YYYYMMDD HH:MM:SS.ffffff, etc. (bytestrings/unicode supported) - from_tz : a string for the timezone of origin: 'NYC', 'GMT', 'DUBLIN', etc. + from_tz : a string for the timezone of origin. 
to_tz : a string for the timezone that the time will be displayed in returns DateTimeNano @@ -405,13 +410,16 @@ def datetimestring_to_nano(dtstring, from_tz=None, to_tz="NYC"): def datestring_to_nano(datestring, time=None, from_tz=None, to_tz="NYC"): """ Converts date string to DateTimeNano object (default midnight). + By default, the timestrings are assumed to be in Eastern Time. If they are already in UTC time, set gmt=True. + |To see supported timezones, use ``rt.TimeZone.valid_timezones``.| + Parameters ---------- datestring : array of datestrings in format YYYY-MM-DD or YYYYMMDD (bytestrings/unicode supported) time : a single string or array of strings in the format HH:MM:SS.ffffff (bytestrings/unicode supported) - from_tz : a string for the timezone of origin: 'NYC', 'GMT', 'DUBLIN', etc. + from_tz : a string for the timezone of origin. to_tz : a string for the timezone that the time will be displayed in returns DateTimenano @@ -452,15 +460,18 @@ def datestring_to_nano(datestring, time=None, from_tz=None, to_tz="NYC"): def timestring_to_nano(timestring, date=None, from_tz=None, to_tz="NYC"): """ Converts timestring to TimeSpan or DateTimeNano object. + By default, the timestrings are assumed to be in Eastern Time. If they are already in UTC time, set gmt=True. If a date is specified, a DateTimeNano object will be returned. If a date is not specified, a TimeSpan will be returned. + |To see supported timezones, use ``rt.TimeZone.valid_timezones``.| + Parameters ---------- timestring : array of timestrings in format HH:MM:SS, H:MM:SS, HH:MM:SS.ffffff (bytestrings/unicode supported) date : a single string or array of date strings in format YYYY-MM-DD (bytestrings/unicode supported) - from_tz : a string for the timezone of origin: 'NYC', 'GMT', 'DUBLIN', etc. + from_tz : a string for the timezone of origin. 
to_tz : a string for the timezone that the time will be displayed in returns TimeSpan or DateTimeNano @@ -507,12 +518,14 @@ def timestring_to_nano(timestring, date=None, from_tz=None, to_tz="NYC"): def parse_epoch(etime, to_tz="NYC"): """Days since epoch and milliseconds since midnight from nanosecond timestamps. + |To see supported timezones, use ``rt.TimeZone.valid_timezones``.| + Parameters ---------- etime : array-like UTC nanoseconds. to_tz : str, default 'NYC' - TimeZone short string - see riptable.rt_timezone. + TimeZone short string. This routine didn't used to take a timezone, so it defaults to the previous setting. Used in the phonyx data loader. @@ -4110,6 +4123,8 @@ class DateTimeNano(DateTimeBase, TimeStampBase, DateTimeCommon): accounting for Daylight Saving Time. The exception is when `arr` is an array of `Date` objects, in which case the default display timezone is UTC. + |To see supported timezones, use ``rt.TimeZone.valid_timezones``.| + Parameters ---------- arr : array of `int`, `str`, `Date`, `TimeSpan`, :py:class:`~datetime.datetime`, `numpy.datetime64` @@ -4144,16 +4159,6 @@ class DateTimeNano(DateTimeBase, TimeStampBase, DateTimeCommon): created from strings, and recommended in other cases to ensure expected results. The default `from_tz` is "UTC" for all `arr` types except strings, for which a `from_tz` must be specified. - - Timezones supported (Daylight Saving Time is accounted for): - - - "America/New_York" - - "Australia/Sydney" - - "Europe/Dublin" - - "DUBLIN": alias for "Europe/Dublin" - - "GMT": Greenwich Mean Time - - "NYC": US/Eastern - - "UTC": (not a timezone, but accepted as an alias for GMT) to_tz : str The timezone the data is displayed in. If `arr` is `Date` objects, the default `to_tz` is "UTC". For other `arr` types, the default `to_tz` is "NYC". 
@@ -4351,230 +4356,6 @@ class DateTimeNano(DateTimeBase, TimeStampBase, DateTimeCommon): # ------------------------------------------------------------ def __new__(cls, arr, from_tz=None, to_tz=None, from_matlab=False, format=None, start_date=None, gmt=None): - """ - Date and timezone-aware time information, stored to nanosecond precision. - - `DateTimeNano` arrays have an underlying `~riptable.int64` array representing the - number of nanoseconds since the Unix epoch (00:00:00 UTC on 01-01-1970). Dates - before the Unix epoch are invalid. - - In most cases, `DateTimeNano` objects default to display in Eastern/NYC time, - accounting for Daylight Saving Time. The exception is when `arr` is an array of - `Date` objects, in which case the default display timezone is UTC. - - Parameters - ---------- - arr : array of `int`, `str`, `Date`, `TimeSpan`, :py:class:`~datetime.datetime`, `numpy.datetime64` - Datetimes to store in the `DateTimeNano` array. - - - Integers represent nanoseconds since the Unix epoch (00:00:00 UTC on - 01-01-1970). - - Datetime strings can generally be in YYYYMMDD HH:MM:SS.fffffffff format - without ``format`` codes needing to be specified. Bytestrings, unicode - strings, and strings in `ISO 8601 `_ - format are supported. If your strings are in another format (for example, - MMDDYY), specify it with ``format``. Other notes for string input: - - - `from_tz` is required. - - - If `start_date` is provided, strings are parsed as `TimeSpan` - objects before `start_date` is applied. See how this affects output in the - Examples section below. - - - For NumPy vs. Riptable string parsing differences, see the Notes section - below. - - - For `Date` objects, both `from_tz` and `to_tz` are "UTC" by - default. - - For `TimeSpan` objects, `start_date` needs to be specified. - - Using the `DateTimeNano` constructor is recommended for - `Date` + `TimeSpan` operations. - - `numpy.datetime64` values are converted to nanoseconds. 
- - from_tz : str - The timezone the data in `arr` is stored in. Required if the `DateTimeNano` is - created from strings, and recommended in other cases to ensure expected results. - The default `from_tz` is "UTC" for all `arr` types except strings, for which a - `from_tz` must be specified. - - Timezones supported (Daylight Saving Time is accounted for): - - - "America/New_York" - - "Australia/Sydney" - - "Europe/Dublin" - - "DUBLIN": alias for "Europe/Dublin" - - "GMT": Greenwich Mean Time - - "NYC": US/Eastern - - "UTC": (not a timezone, but accepted as an alias for GMT) - to_tz : str - The timezone the data is displayed in. If `arr` is `Date` objects, the default - `to_tz` is "UTC". For other `arr` types, the default `to_tz` is "NYC". - from_matlab : bool, default False - When set to `True`, indicates that `arr` contains Matlab datenums (the number - of days since 0-Jan-0000). Because Matlab datenums may also include a fraction - of a day, be sure to specify `from_tz` for accurate time data. - format : str - Specify a format for string `arr` input. For format codes, see the `Python - strptime cheatsheet `_. This parameter is ignored for - non-string `arr` input. - start_date : `str` or array of `Date` - - Required if constructing a `DateTimeNano` from a `TimeSpan`. - - If `arr` is strings, the values in `arr` are parsed as `TimeSpan` objects - before `start_date` is applied. See how this affects output in the Examples - section below. Otherwise, `start_date` is added (as nanos) to dates in `arr`. - - If `start_date` is a string, use YYYYMMDD format. - - If `start_date` is a `Date` array, it is broadcast to `arr` if possible; - otherwise an error is raised. - - A `start_date` before the Unix epoch is converted to the Unix epoch. - - Notes - ----- - - The constructor does not attempt to preserve NaN times from Python - :py:class:`~datetime.datetime` objects. - - If the integer data in a `DateTimeNano` object is extracted, it is in the - `from_tz` timezone. 
To initialize another `DateTimeNano` with the same underlying - array, use the same `from_tz`. - - `DateTimeNano` objects have no knowledge of timezones. All timezone operations - are handled by the `TimeZone` class. - - - Math Operations - - The following math operations can be performed: - - +----------------------------------------+ - | Date + TimeSpan = DateTimeNano | - +----------------------------------------+ - | Date - DateTimeNano = TimeSpan | - +----------------------------------------+ - | Date - TimeSpan = DateTimeNano | - +----------------------------------------+ - | DateTimeNano - DateTimeNano = TimeSpan | - +----------------------------------------+ - | DateTimeNano - Date = TimeSpan | - +----------------------------------------+ - | DateTimeNano - TimeSpan = DateTimeNano | - +----------------------------------------+ - | DateTimeNano + TimeSpan = DateTimeNano | - +----------------------------------------+ - - String Parsing Differences Between NumPy and Riptable - - - Riptable `DateTimeNano` string parsing is generally more forgiving than NumPy's - `numpy.datetime64` array parsing. - - In some cases where NumPy raises an error, Riptable returns an object. - - The lower limit for `DateTimeNano` string parsing is Unix epoch time. - - You can always guarantee that Riptable and NumPy get the same results by using - the full `ISO 8601 `_ datetime format - (YYYY-MM-DDTHH:MM:SS.fffffffff). 
- - Riptable parses strings without leading zeros: - - >>> import numpy as np - >>> rt.DateTimeNano(["2018-1-1"], from_tz="NYC") - DateTimeNano(['20180101 00:00:00.000000000'], to_tz='NYC') - >>> np.array(["2018-1-1"], dtype="datetime64[ns]") - ValueError: Error parsing datetime string "2018-1-1" at position 5 - - Riptable handles extra trailing spaces; NumPy incorrectly treats them as a - timezone whose parsing will be deprecated soon: - - >>> rt.DateTimeNano(["2018-10-11 10:11:00.123 "], from_tz="NYC") - DateTimeNano(['20181011 10:11:00.123000000'], to_tz='NYC') - >>> np.array(["2018-10-11 10:11:00.123 "], dtype="datetime64[ns]") - DeprecationWarning: parsing timezone aware datetimes is deprecated; this will - raise an error in the future - array(['2018-10-11T10:11:00.123000000'], dtype='datetime64[ns]') - - Riptable correctly parses dates without delimiters: - - >>> rt.DateTimeNano(["20181231"], from_tz="NYC") - DateTimeNano(['20181231 00:00:00.000000000'], to_tz='NYC') - >>> np.array(["20181231"], dtype="datetime64[ns]") - array(['1840-08-31T19:51:12.568664064'], dtype='datetime64[ns]') - - To ensure that Riptable and NumPy get the same results, use the full - `ISO 8601 `_ datetime format: - - >>> rt.DateTimeNano(["2018-12-31T12:34:56.789123456"], from_tz="NYC") - DateTimeNano(['20181231 12:34:56.789123456'], to_tz='NYC') - >>> np.array(["2018-12-31T12:34:56.789123456"], dtype="datetime64[ns]") - array(['2018-12-31T12:34:56.789123456'], dtype='datetime64[ns]') - - See Also - -------- - DateTimeNano.info : See timezone info for a `DateTimeNano` object. - Date : Riptable's `Date` class. - DateSpan : Riptable's `DateSpan` class. - TimeSpan : Riptable's `TimeSpan` class. - .TimeZone : Riptable's `.TimeZone` class. 
- - Examples - -------- - Create a `DateTimeNano` from an integer representing the nanoseconds since 00:00:00 - UTC on 01-01-1970: - - >>> rt.DateTimeNano([1514828730123456000], from_tz="UTC") - DateTimeNano(['20180101 12:45:30.123456000'], to_tz='NYC') - - From a datetime string in NYC time: - - >>> rt.DateTimeNano(["2018-01-01 12:45:30.123456000"], from_tz="NYC") - DateTimeNano(['20180101 12:45:30.123456000'], to_tz='NYC') - - From `numpy.datetime64` array (note that NumPy has less precision): - - >>> dt = np.array(["2018-11-02 09:30:00.002201", "2018-11-02 09:30:00.004212"], dtype="datetime64[ns]") - >>> rt.DateTimeNano(dt, from_tz="NYC") - DateTimeNano(['20181102 09:30:00.002201000', '20181102 09:30:00.004212000'], to_tz='NYC') - - If your datetime strings are nonstandard, specify the format using ``format`` with - `Python strptime codes `_. - - >>> rt.DateTimeNano(["12/31/19 08:05:01", "6/30/19 14:20:35"], format="%m/%d/%y %H:%M:%S", from_tz="NYC") - DateTimeNano(['20191231 08:05:01.000000000', '20190630 14:20:35.000000000'], to_tz='NYC') - - Convert Matlab datenums: - - >>> rt.DateTimeNano([737426, 738251.75], from_matlab=True, from_tz="NYC") - DateTimeNano(['20190101 00:00:00.000000000', '20210405 18:00:00.000000000'], to_tz='NYC') - - Note that if you create a `DateTimeNano` by adding a `Date` and a `TimeSpan` without - using the `DateTimeNano` constructor, `from_tz` and `to_tz` will be "GMT": - - >>> d = rt.Date("20230305") - >>> ts = rt.TimeSpan("05:00") - >>> dtn = d + ts - >>> dtn.info() - DateTimeNano(['20230305 05:00:00.000000000'], to_tz='GMT') - Displaying in timezone: GMT - Origin: GMT - Offset: 0 hours - - Create a `DateTimeNano` from a list of Python :py:class:`~datetime.datetime` - objects: - - >>> from datetime import datetime as dt - >>> pdt = [dt(2018, 7, 2, 14, 30), dt(2019, 6, 8, 8, 30)] - >>> rt.DateTimeNano(pdt) - UserWarning: FastArray contains an unsupported type 'object'. Problems may occur. - Consider categoricals. 
- warnings.warn(warning_string) - DateTimeNano(['20180702 10:30:00.000000000', '20190608 04:30:00.000000000', to_tz='NYC') - - If you specify a `start_date` with an `arr` of strings, the strings are parsed as - `TimeSpan` objects before `start_date` is applied. Note the first two examples in - ``arr`` result in NaN TimeSpans, which are silently treated as zeros: - - >>> arr = ["20180205", "20180205 14:30", "14:30"] - >>> rt.DateTimeNano(arr, from_tz="UTC", to_tz="UTC", start_date="20230601") - DateTimeNano(['20230601 00:00:00.000000000', '20230601 00:00:00.000000000', '20230601 14:30:00.000000000'], to_tz='UTC') - - `.GetNanoTime` gets the current Unix epoch time: - - >>> rt.DateTimeNano([rt.GetNanoTime()], from_tz="UTC") - DateTimeNano(['20230615 18:36:58.378020700'], to_tz='NYC') - """ # changing defaults / requirments based on constructor # non-string constructors don't require from_tz keyword to be set # need to store original keyword values to check in the funnel (saving all in case we add more) @@ -4800,10 +4581,12 @@ def set_timezone(self, tz): Different lookup array will be used for daylight savings fixups. Does not modify the underlying array. + |To see supported timezones, use ``rt.TimeZone.valid_timezones``.| + Parameters ---------- tz : str - Abbreviated name of desired timezone. See rt.TimeZone.valid_timezones + Abbreviated name of desired timezone. Examples -------- @@ -4829,12 +4612,15 @@ def set_timezone(self, tz): def astimezone(self, tz): """ Returns a new DateTimeNano object in a different displayed timezone. + The new object holds a reference to the same underlying array. + |To see supported timezones, use ``rt.TimeZone.valid_timezones``.| + Parameters ---------- tz : str - Abbreviated name of desired timezone. See rt.TimeZone.valid_timezones + Abbreviated name of desired timezone. 
Returns ------- @@ -5985,16 +5771,16 @@ def random(cls, sz, to_tz="NYC", from_tz="NYC", inv=None, start=None, end=None): If `start` and `end` are not provided, years range from 1971 to 2020. + |To see supported timezones, use ``rt.TimeZone.valid_timezones``.| + Parameters ---------- sz : int The length of the generated array. to_tz : str, default 'NYC' - The timezone for display. For valid timezone options, see - :py:attr:`.TimeZone.valid_timezones`. + The timezone for display. from_tz : str, default 'NYC' - The timezone of origin. For valid timezone options, see - :py:attr:`.TimeZone.valid_timezones`. + The timezone of origin. inv : array of bool, optional Where True, an invalid `DateTimeNano` is in the returned array. start : int, optional @@ -6047,16 +5833,16 @@ def random_invalid(cls, sz, to_tz="NYC", from_tz="NYC", start=None, end=None): If `start` and `end` are not provided, years for valid `DateTimeNano` values range from 1971 to 2020. + |To see supported timezones, use ``rt.TimeZone.valid_timezones``.| + Parameters ---------- sz : int The length of the generated array. to_tz : str, default 'NYC' - The timezone for display. For valid timezone options, see - :py:attr:`.TimeZone.valid_timezones`. + The timezone for display. from_tz : str, default 'NYC' - The timezone of origin. For valid timezone options, see - :py:attr:`.TimeZone.valid_timezones`. + The timezone of origin. start : int, optional The start year for the range. If no end year is provided, all times are within the start year. 
diff --git a/riptable/rt_display.py b/riptable/rt_display.py index 7bf8e50..2616d79 100644 --- a/riptable/rt_display.py +++ b/riptable/rt_display.py @@ -1705,8 +1705,36 @@ def join_row_section(rowstrings, idx=None): # html_string_list.append("") + custom_table_css = "" + if DisplayOptions.HTML_CUSTOM_TABLE_CSS is not None: + css_list = ( + DisplayOptions.HTML_CUSTOM_TABLE_CSS + if isinstance(DisplayOptions.HTML_CUSTOM_TABLE_CSS, list) + else [DisplayOptions.HTML_CUSTOM_TABLE_CSS] + ) + custom_table_css = "\n".join(f" {css_prefix} {css}" for css in css_list) + html_string_list.append( - f"" + f""" + + + + + """ ) html_string_list.append(f"") diff --git a/riptable/rt_merge.py b/riptable/rt_merge.py index 62e0a47..4d493ce 100644 --- a/riptable/rt_merge.py +++ b/riptable/rt_merge.py @@ -2885,49 +2885,66 @@ def merge( hint_size: int = 0, ): """ - Merge Dataset by performing a database-style join operation by columns. + Merge :py:class:`~.rt_dataset.Dataset` objects by performing a database-style join + operation by columns. Parameters ---------- - left : Dataset - Left Dataset - right : Dataset - Right Dataset + left : :py:class:`~.rt_dataset.Dataset` + Left :py:class:`~.rt_dataset.Dataset`. + right : :py:class:`~.rt_dataset.Dataset` + Right :py:class:`~.rt_dataset.Dataset`. on : str or list of str, optional - Column names to join on. Must be found in both `left` and `right`. + Column names to join on. Must be found in both ``left`` and ``right``. left_on : str or list of str, optional - Column names from left Dataset to join on. When specified, overrides whatever is specified in `on`. + Column names from ``left`` to join on. When specified, overrides whatever is specified in ``on``. right_on : str or list of str, optional - Column names from right to join on. When specified, overrides whatever is specified in `on`. - how : {'left','right', 'inner', 'outer'} - - left: use only keys from the left. 
**The output rows will be in one-to-one correspondence with the left rows!** If multiple matches on the right occur, the last is taken. - - right: use only keys from the right. **The output rows will be in one-to-one correspondence - with the left rows!** If multiple matches on the left occur, the last is taken. - - inner: use intersection of keys from both Datasets, similar to SQL inner join - - outer: use union of keys from both Datasets, similar to SQL outer join + Column names from ``right`` to join on. When specified, overrides whatever is specified in ``on``. + how : {"left", "right", "inner", "outer"}, default "left" + - "left": Uses all of the ``left`` keys to find matches. If ``left`` has no + match in ``right``, invalid or empty values are filled in. If there are + multiple matches in ``right``, the first match is included in the returned + :py:class:`~.rt_dataset.Dataset`. + - "right": Uses all of the ``right`` keys to find matches. If ``right`` has no + match in ``left``, invalid or empty values are filled in. If there are + multiple matches in ``left``, the first match is included in the returned + :py:class:`~.rt_dataset.Dataset`. + - "inner": Similar to a SQL inner join. Uses the intersection of keys from both + :py:class:`~.rt_dataset.Dataset` objects to find matches, so only rows with + matching key values are in the returned :py:class:`~.rt_dataset.Dataset`. + If there are multiple matches, the first match is included in the returned + :py:class:`~.rt_dataset.Dataset`. + - "outer": Similar to a SQL outer join. Uses the union of keys from both + :py:class:`~.rt_dataset.Dataset` objects to find matches. For rows that don't + have matches, invalid and empty values are filled in. If there are multiple + matches, the first match is included in the returned + :py:class:`~.rt_dataset.Dataset`. suffixes: tuple of (str, str), default ('_x', '_y') - Suffix to apply to overlapping column names in the left and right side, respectively. 
- To raise an exception on overlapping columns use (False, False). - indicator : bool or str, default False - If True, adds a column to output Dataset called "merge_indicator" with information on the - source of each row. If string, column with information on source of each row will be added - to output Dataset, and column will be named value of string. Information column is - Categorical-type and takes on a value of "left_only" for observations whose merge key only - appears in `left` Dataset, "right_only" for observations whose merge key only appears in - `right` Dataset, and "both" if the observation's merge key is found in both. + Suffix to apply to overlapping column names in ``left`` and ``right``, respectively. + To raise an exception on overlapping columns use ``(False, False)``. + indicator : bool or str, default `False` + If `True`, adds a column to the returned :py:class:`~.rt_dataset.Dataset` called + "merge_indicator" with information on the source of each row. If string, column + with information on source of each row is added to the returned + :py:class:`~.rt_dataset.Dataset`, and column is named value of string. + Information column is Categorical-type and takes on a value of ``"left_only"`` + for observations whose merge key only appears in ``left``, ``"right_only"`` for + observations whose merge key only appears in ``right``, and ``"both"`` if the + observation's merge key is found in both. columns_left : str or list of str, optional - Column names to include in the merge from `left`, defaults to None which causes all columns to be included. + Column names to include in the merge from ``left``, defaults to `None` which causes all columns to be included. columns_right : str or list of str, optional - Column names to include in the merge from `right`, defaults to None which causes all columns to be included. 
- verbose : boolean - For the stdout debris, defaults to False + Column names to include in the merge from ``right``, defaults to `None` which causes all columns to be included. + verbose : boolean, default `False` + For the stdout debris hint_size : int An estimate of the number of unique keys used for the join, to optimize performance by pre-allocating memory for the key hash table. Returns ------- - merged : Dataset + merged : :py:class:`~.rt_dataset.Dataset` + The merged :py:class:`~.rt_dataset.Dataset`. Examples -------- @@ -2940,7 +2957,7 @@ def merge( [3 rows x 4 columns] total bytes: 72.0 B - Demonstrating a 'left' merge. + Demonstrating a ``"left"`` merge. >>> rt.merge(ds_complex_1, ds_complex_2, on = ['A','B'], how = 'left') # B A C E @@ -2954,7 +2971,8 @@ def merge( See Also -------- - merge_asof + :py:func:`.rt_merge.merge_asof` : Combine two :py:class:`~.rt_dataset.Dataset` + objects by performing a database-style left-join based on the nearest numeric key. """ # Collect timing stats on how long various stages of the merge operation take. start = GetNanoTime() diff --git a/riptable/rt_numpy.py b/riptable/rt_numpy.py index 946741a..ff4b764 100644 --- a/riptable/rt_numpy.py +++ b/riptable/rt_numpy.py @@ -121,6 +121,7 @@ "tile", "transpose", "trunc", + "unique", "unique32", "var", "vstack", @@ -508,13 +509,18 @@ def empty(shape, dtype: Union[str, np.dtype, type] = float, order: str = "C") -> """ Return a new array of specified shape and type, without initializing entries. + Unlike :py:func:`~.rt_numpy.zeros`, :py:func:`~.rt_numpy.empty` doesn't set the + array values to zero, so it may be marginally faster. On the other hand, it requires + the user to manually set all the values in the array, so it should be used with + caution. + Parameters ---------- shape : int or tuple of int Shape of the empty array, e.g., ``(2, 3)`` or ``2``. 
Note that although multi-dimensional arrays are technically supported by Riptable, you may get unexpected results when working with them. - dtype : str or NumPy dtype or Riptable dtype, default `numpy.float64` + dtype : str or :py:class:`numpy.dtype` or Riptable dtype, default :py:obj:`numpy.float64` The desired data type for the array. order : {'C', 'F'}, default 'C' Whether to store multi-dimensional data in row-major (C-style) or @@ -522,27 +528,35 @@ def empty(shape, dtype: Union[str, np.dtype, type] = float, order: str = "C") -> Returns ------- - `FastArray` - A new `FastArray` of uninitialized (arbitrary) data of the specified - shape and type. + :py:class:`~.rt_fastarray.FastArray` + A new :py:class:`~.rt_fastarray.FastArray` of uninitialized (arbitrary) data of + the specified shape and type. See Also -------- - riptable.empty_like, riptable.ones, riptable.ones_like, riptable.zeros, - riptable.zeros_like, riptable.empty, riptable.full, Categorical.full - - Notes - ----- - Unlike `zeros`, `empty` doesn't set the array values to zero, so it may - be marginally faster. On the other hand, it requires the user to manually - set all the values in the array, so it should be used with caution. + :py:func:`.rt_numpy.empty_like` + :py:func:`.rt_numpy.ones` + :py:func:`.rt_numpy.ones_like` + :py:func:`.rt_numpy.zeros` + :py:func:`.rt_numpy.zeros_like` + :py:func:`.rt_numpy.empty` + :py:func:`.rt_numpy.full` + :py:meth:`.rt_categorical.Categorical.full` Examples -------- - >>> rt.empty(5) + >>> rt.empty(5) # doctest: +SKIP FastArray([0. , 0.25, 0.5 , 0.75, 1. ]) # uninitialized - >>> rt.empty(5, dtype = int) + Note that the results from :py:func:`~.rt_numpy.empty` vary, given that the + entries in the resulting :py:class:`~.rt_fastarray.FastArray` objects are uninitialized. 
+ For example: + + >>> rt.empty(5) # doctest: +SKIP + FastArray([3.21142670e-322, 0.00000000e+000, 1.42173718e-312, + 2.48273508e-312, 2.46151512e-312]) # uninitialized + + >>> rt.empty(5, dtype=int) # doctest: +SKIP FastArray([80288976, 0, 0, 0, 1]) # uninitialized """ # return LedgerFunction(np.empty, shape, dtype=dtype, order=order) @@ -577,46 +591,54 @@ def empty_like( Parameters ---------- array : array - The shape and data type of `array` define the same attributes of the + The shape and data type of ``array`` define the same attributes of the returned array. Note that although multi-dimensional arrays are technically supported by Riptable, you may get unexpected results when working with them. - dtype : str or NumPy dtype or Riptable dtype, optional + dtype : str or :py:class:`numpy.dtype` or Riptable dtype, optional Overrides the data type of the result. order : {'K', C', 'F', or 'A'}, default 'K' Overrides the memory layout of the result. 'K' (the default) means - match the layout of `array` as closely as possible. 'C' means + match the layout of ``array`` as closely as possible. 'C' means row-major (C-style); 'F' means column-major (Fortran-style); 'A' - means 'F' if `array` is Fortran-contiguous, 'C' otherwise. - subok : bool, default True - If True (the default), then the newly created array will use the - sub-class type of `array`, otherwise it will be a base-class array. + means 'F' if ``array`` is Fortran-contiguous, 'C' otherwise. + subok : bool, default `True` + If `True` (the default), then the newly created array uses the + sub-class type of ``array``, otherwise it is a base-class array. shape : int or sequence of ints, optional - Overrides the shape of the result. If order='K' and the number of - dimensions is unchanged, it will try to keep the same order; otherwise, - order='C' is implied. Note that although multi-dimensional arrays are + Overrides the shape of the result. 
If ``order='K'`` and the number of + dimensions is unchanged, it tries to keep the same order; otherwise, + ``order='C'`` is implied. Note that although multi-dimensional arrays are technically supported by Riptable, you may get unexpected results when working with them. Returns ------- - `FastArray` - A new `FastArray` of uninitialized (arbitrary) data with the same shape - and type as `array`. + :py:class:`~.rt_fastarray.FastArray` + A new :py:class:`~.rt_fastarray.FastArray` of uninitialized (arbitrary) data + with the same shape and type as ``array``. See Also -------- - riptable.empty, riptable.ones, riptable.ones_like, riptable.zeros, - riptable.zeros_like, riptable.full, Categorical.full + :py:func:`.rt_numpy.empty` + :py:func:`.rt_numpy.ones` + :py:func:`.rt_numpy.ones_like` + :py:func:`.rt_numpy.zeros` + :py:func:`.rt_numpy.zeros_like` + :py:func:`.rt_numpy.full` + :py:meth:`.rt_categorical.Categorical.full` Examples -------- >>> a = rt.FastArray([1, 2, 3, 4]) - >>> rt.empty_like(a) - FastArray([ 1814376192, 1668069856, -1994737310, 746250422]) # uninitialized + >>> rt.empty_like(a) # doctest: +SKIP + FastArray([1, 2, 4, 7]) # uninitialized + + Note that the results from :py:func:`~.rt_numpy.empty_like` vary, given that the + entries from the resulting :py:class:`~.rt_fastarray.FastArray` objects are uninitialized. - >>> rt.empty_like(a, dtype = float) - FastArray([0.25, 0.5 , 0.75, 1. ]) # uninitialized + >>> rt.empty_like(a, dtype=float) # doctest: +SKIP + FastArray([0. , 0. , 6.4, 4.8]) # uninitialized """ # TODO: call recycler @@ -659,9 +681,9 @@ def searchsorted(a, v, side="left", sorter=None) -> int: # ------------------------------------------------------- def issorted(*args) -> bool: """ - Return True if the array is sorted, False otherwise. + Return `True` if the array is sorted, `False` otherwise. - NaNs at the end of an array are considered sorted. + ``NaN`` values at the end of an array are considered sorted. 
Parameters ---------- @@ -671,11 +693,11 @@ def issorted(*args) -> bool: Returns ------- bool - True if the array is sorted, False otherwise. + `True` if the array is sorted, `False` otherwise. See Also -------- - FastArray.issorted + :py:meth:`.rt_fastarray.FastArray.issorted` Examples -------- @@ -723,35 +745,35 @@ def unique( Input array, or a list of arrays that are the same shape. If a list of arrays is provided, it's treated as a multikey in which the arrays' values at corresponding indices are associated. - return_index : bool, default False - If True, also return the indices of the first occurrences of the unique values + return_index : bool, default `False` + If `True`, also return the indices of the first occurrences of the unique values (for one input array) or unique combinations (for multiple input arrays) in - `arr`. - return_inverse : bool, default False - If True, also return the indices of the unique array (for one input array) or - combinations (for multiple input arrays) that can be used to reconstruct `arr`. - return_counts : bool, default False - If True, also return the number of times each unique item (for one input array) - or combination (for multiple input arrays) appears in `arr`. - sorted : bool, default True - Indicates whether the results are returned in sorted order. Defaults to True, - which replicates the behavior of the NumPy version of this function. When False - (which is often faster), the display order is first appearance. - If `lex` is set to True, the value of this parameter is ignored and the results - are always returned in sorted order. - lex : bool, default False + ``arr``. + return_inverse : bool, default `False` + If `True`, also return the indices of the unique array (for one input array) or + combinations (for multiple input arrays) that can be used to reconstruct ``arr``. 
+ return_counts : bool, default `False` + If `True`, also return the number of times each unique item (for one input array) + or combination (for multiple input arrays) appears in ``arr``. + sorted : bool, default `True` + Indicates whether the results are returned in sorted order. Defaults to `True`, + which replicates the behavior of the NumPy version of this function. When `False` + (which is often faster), the display order is first appearance. If ``lex`` is set + to `True`, the value of this parameter is ignored and the results are always + returned in sorted order. + lex : bool, default `False` Controls whether the function uses hashing- or sorting-based logic to find the - unique values in `arr`. Defaults to False (hashing). Set to True to use a - lexicographical sort instead; this can be faster when `arr` is a large array + unique values in ``arr``. Defaults to `False` (hashing). Set to `True` to use a + lexicographical sort instead; this can be faster when ``arr`` is a large array with a relatively high proportion of unique values. - dtype : {None, 'b', 'B', 'h', 'H', 'i', 'I', 'l', 'L', 'q', 'Q', 'p', 'P'} default None - If an index is returned via `return_index` or `return_inverse`, you can use a - NumPy data type character code to specify the data type of the returned index. + dtype : {None, 'b', 'B', 'h', 'H', 'i', 'I', 'l', 'L', 'q', 'Q', 'p', 'P'}, default `None` + If an index is returned via ``return_index`` or ``return_inverse``, you can use + a NumPy data type character code to specify the data type of the returned index. For definitions of the character codes for integer types, see :ref:`arrays.scalars.character-codes`. - filter: ndarray of bool, default None - If provided, any False values will be ignored in the calculation. If provided - and `return_inverse` is True, a filtered-out location will be -1. + filter : ndarray of bool, default `None` + If provided, any `False` values are ignored in the calculation. 
If provided + and ``return_inverse`` is `True`, a filtered-out location is -1. Returns ------- @@ -759,31 +781,31 @@ def unique( For one input array, one array is returned that contains the unique values. For multiple input arrays, a list of arrays is returned that collectively contains every unique combination of values found in the arrays' corresponding indices. - unique_indices : `FastArray`, optional + unique_indices : :py:class:`~.rt_fastarray.FastArray`, optional The indices of the first occurrences of the unique values in the original array. - Only provided if `return_index` is True. - unique_inverse : `FastArray`, optional + Only provided if ``return_index`` is `True`. + unique_inverse : :py:class:`~.rt_fastarray.FastArray`, optional The indices of the unique array (for one input array) or unique combinations - (for multiple input arrays) that can be used to reconstruct `arr`. Only provided - if `return_inverse` is True. - unique_counts : `FastArray`, optional + (for multiple input arrays) that can be used to reconstruct ``arr``. Only + provided if ``return_inverse`` is `True`. + unique_counts : :py:class:`~.rt_fastarray.FastArray`, optional The number of times each of the unique values comes up in the original array. - Only provided if `return_counts` is True. + Only provided if ``return_counts`` is `True`. Notes ----- - ``rt.unique`` often performs faster than ``np.unique`` for strings and numeric - types. + :py:func:`~.rt_numpy.unique` often performs faster than :py:func:`numpy.unique` for + strings and numeric types. - `Categorical` objects passed in as `arr` will ignore the `sorted` flag and return - their current order. + :py:class:`~.rt_categorical.Categorical` objects passed in as ``arr`` ignore the ``sorted`` + flag and return their current order. 
Examples -------- >>> rt.unique(['b','b','a','d','d']) FastArray(['a', 'b', 'd'], dtype='>> rt.unique(['b','b','a','d','d'], sorted = False) @@ -806,7 +828,7 @@ def unique( >>> u FastArray([b'a', b'b', b'c'], dtype='|S1') >>> indices - FastArray([0, 1, 3], dtype=int64) + FastArray([0, 1, 3]) >>> a[indices] FastArray([b'a', b'b', b'c'], dtype='|S1') @@ -833,7 +855,7 @@ def unique( >>> values FastArray([1, 2, 3, 4, 6]) >>> counts - FastArray([1, 3, 1, 1, 1]) + FastArray([1, 3, 1, 1, 1], dtype=int32) >>> rt.repeat(values, counts) FastArray([1, 2, 2, 2, 3, 4, 6]) """ @@ -2316,17 +2338,24 @@ def any(*args, **kwargs) -> bool: # ------------------------------------------------------- -def arange(*args, **kwargs) -> "FastArray": +def arange( + start: Union[int, float] = None, + stop: Optional[Union[int, float]] = None, + step: Union[int, float] = 1, + *, + dtype: Optional[npt.Dtype] = None, + like: npt.ArrayLike = None, +) -> "FastArray": """ Return an array of evenly spaced values within a specified interval. - The half-open interval includes `start` but excludes `stop`: ``[start, stop)``. + The half-open interval includes ``start`` but excludes ``stop``: ``[start, stop)``. For integer arguments the function is roughly equivalent to the Python - built-in :py:obj:`range`, but returns a `FastArray` rather than a - :py:obj:`range` instance. + built-in :py:obj:`range`, but returns a :py:class:`~.rt_fastarray.FastArray` rather + than a :py:obj:`range` instance. - When using a non-integer step, such as 0.1, it's often better to use + When using a non-integer ``step``, such as 0.1, it's often better to use :py:func:`numpy.linspace`. For additional warnings, see :py:func:`numpy.arange`. @@ -2337,36 +2366,42 @@ def arange(*args, **kwargs) -> "FastArray": Start of interval. The interval includes this value. stop : int or float End of interval. 
The interval does not include this value, except in - some cases where `step` is not an integer and floating point round-off + some cases where ``step`` is not an integer and floating point round-off affects the length of the output. step : int or float, default 1 - Spacing between values. For any output `out`, this is the distance - between two adjacent values: ``out[i+1] - out[i]``. If `step` - is specified as a positional argument, `start` must also be given. - dtype : str or NumPy dtype or Riptable dtype, optional - The type of the output array. If `dtype` is not given, the data type + Spacing between values. For any output ``out``, this is the distance + between two adjacent values: ``out[i+1] - out[i]``. If ``step`` + is specified as a positional argument, ``start`` must also be given. + dtype : str or :py:class:`numpy.dtype` or Riptable dtype, optional + The type of the output array. If ``dtype`` is not given, the data type is inferred from the other input arguments. like : array_like, optional Reference object to allow the creation of arrays that are not NumPy - arrays. If an array-like passed in as `like` supports the - ``__array_function__`` protocol, the result will be defined by it. + arrays. If an array-like passed in as ``like`` supports the + ``__array_function__`` protocol, the result is defined by it. In this case, it ensures the creation of an array object compatible with that passed in via this argument. Returns ------- - `FastArray` - A `FastArray` of evenly spaced numbers within the specified interval. - For floating point arguments, the length of the result is - ``ceil((stop - start)/step)``. Because of floating point overflow, - this rule may result in the last element of the output being greater - than `stop`. + :py:class:`~.rt_fastarray.FastArray` + A :py:class:`~.rt_fastarray.FastArray` of evenly spaced numbers within the + specified interval. For floating point arguments, the length of the result is + ``ceil((stop - start)/step)``. 
Because of floating point overflow, this rule may + result in the last element of the output being greater than ``stop``. See Also -------- - numpy.arange, riptable.ones, riptable.ones_like, riptable.zeros, - riptable.zeros_like, riptable.empty, riptable.empty_like, riptable.full, - riptable.arange, Categorical.full + :py:func:`numpy.arange` + :py:func:`.rt_numpy.ones` + :py:func:`.rt_numpy.ones_like` + :py:func:`.rt_numpy.zeros` + :py:func:`.rt_numpy.zeros_like` + :py:func:`.rt_numpy.empty` + :py:func:`.rt_numpy.empty_like` + :py:func:`.rt_numpy.full` + :py:func:`.rt_numpy.arange` + :py:meth:`.rt_categorical.Categorical.full` Examples -------- @@ -2374,7 +2409,7 @@ def arange(*args, **kwargs) -> "FastArray": FastArray([0, 1, 2]) >>> rt.arange(3.0) - FastArray([ 0., 1., 2.]) + FastArray([0., 1., 2.]) >>> rt.arange(3, 7) FastArray([3, 4, 5, 6]) @@ -2382,7 +2417,17 @@ def arange(*args, **kwargs) -> "FastArray": >>> rt.arange(3, 7, 2) FastArray([3, 5]) """ - return LedgerFunction(np.arange, *args, **kwargs) + kwargs = {} + # Avoid passing thru default 'like' (https://github.com/numpy/numpy/issues/22069, fixed in NumPy-1.24) + if like is not None: + kwargs["like"] = like + + if start is None: + if stop is None: + return np.arange(step=step, dtype=dtype, **kwargs) # always an error + return LedgerFunction(np.arange, stop, step=step, dtype=dtype, **kwargs) + + return LedgerFunction(np.arange, start, stop=stop, step=step, dtype=dtype, **kwargs) # ------------------------------------------------------- @@ -2475,24 +2520,29 @@ def full(shape, fill_value, dtype=None, order="C") -> FastArray: you may get unexpected results when working with them. fill_value : scalar or array Fill value. For 1-dimensional arrays, only scalar values are accepted. - dtype : str or NumPy dtype or Riptable dtype, optional + dtype : str or :py:class:`numpy.dtype` or Riptable dtype, optional The desired data type for the array. 
The default is the data type that - would result from creating a `FastArray` with the specified `fill_value`: - ``rt.FastArray(fill_value).dtype``. + would result from creating a :py:class:`~.rt_fastarray.FastArray` with the + specified ``fill_value``: ``rt.FastArray(fill_value).dtype``. order : {'C', 'F'}, default 'C' Whether to store multi-dimensional data in row-major (C-style) or column-major (Fortran-style) order in memory. Returns ------- - `FastArray` - A new `FastArray` of the specified shape and type, filled with the - specified value. + :py:class:`~.rt_fastarray.FastArray` + A new :py:class:`~.rt_fastarray.FastArray` of the specified shape and type, + filled with the specified value. See Also -------- - Categorical.full, riptable.ones, riptable.ones_like, riptable.zeros, - riptable.zeros_like, riptable.empty, riptable.empty_like + :py:meth:`.rt_categorical.Categorical.full` + :py:func:`.rt_numpy.ones` + :py:func:`.rt_numpy.ones_like` + :py:func:`.rt_numpy.zeros` + :py:func:`.rt_numpy.zeros_like` + :py:func:`.rt_numpy.empty` + :py:func:`.rt_numpy.empty_like` Examples -------- @@ -2528,7 +2578,7 @@ def full_like( working with them. fill_value : scalar or array_like Fill value. - dtype : str or NumPy dtype or Riptable dtype, optional + dtype : str or :py:class:`numpy.dtype` or Riptable dtype, optional Overrides the data type of the result. order : {'C', 'F', 'A', or 'K'}, default 'K' Overrides the memory layout of the result. 'C' means row-major (C-style), @@ -2605,27 +2655,32 @@ def ones(shape, dtype=None, order="C", *, like=None) -> "FastArray": Shape of the new array, e.g., ``(2, 3)`` or ``2``. Note that although multi-dimensional arrays are technically supported by Riptable, you may get unexpected results when working with them. - dtype : str or NumPy dtype or Riptable dtype, default `numpy.float64` + dtype : str or :py:class:`numpy.dtype` or Riptable dytpe, default :py:obj:`numpy.float64` The desired data type for the array. 
- order: {'C', 'F'}, default 'C' + order : {'C', 'F'}, default 'C' Whether to store multi-dimensional data in row-major (C-style) or column-major (Fortran-style) order in memory. - like : array_like, default None + like : array_like, default `None` Reference object to allow the creation of arrays that are not NumPy - arrays. If an array-like passed in as `like` supports the - ``__array_function__`` protocol, the result will be defined by it. + arrays. If an array-like passed in as ``like`` supports the + ``__array_function__`` protocol, the result is defined by it. In this case, it ensures the creation of an array object compatible with that passed in via this argument. Returns ------- - `FastArray` - A new `FastArray` of the specified shape and type, filled with ones. + :py:class:`~.rt_fastarray.FastArray` + A new :py:class:`~.rt_fastarray.FastArray` of the specified shape and type, + filled with ones. See Also -------- - riptable.ones_like, riptable.zeros, riptable.zeros_like, - riptable.empty, riptable.empty_like, riptable.full + :py:func:`.rt_numpy.ones_like` + :py:func:`.rt_numpy.zeros` + :py:func:`.rt_numpy.zeros_like` + :py:func:`.rt_numpy.empty` + :py:func:`.rt_numpy.empty_like` + :py:func:`.rt_numpy.full` Examples -------- @@ -2645,37 +2700,41 @@ def ones_like(a, dtype=None, order="K", subok=True, shape=None) -> "FastArray": Parameters ---------- a : array - The shape and data type of `a` define the same attributes of the + The shape and data type of ``a`` define the same attributes of the returned array. Note that although multi-dimensional arrays are technically supported by Riptable, you may get unexpected results when working with them. - dtype : str or NumPy dtype or Riptable dtype, optional + dtype : str or :py:class:`numpy.dtype` or Riptable dtype, optional Overrides the data type of the result. order : {'C', 'F', 'A', or 'K'}, default 'K' Overrides the memory layout of the result. 
'C' means row-major (C-style), - 'F' means column-major (Fortran-style), 'A' means 'F' if `a` is - Fortran-contiguous, 'C' otherwise. 'K' means match the layout of `a` as + 'F' means column-major (Fortran-style), 'A' means 'F' if ``a`` is + Fortran-contiguous, 'C' otherwise. 'K' means match the layout of ``a`` as closely as possible. - subok : bool, default True - If True (the default), then the newly created array will use the sub-class - type of `a`, otherwise it will be a base-class array. + subok : bool, default `True` + If `True` (the default), then the newly created array uses the sub-class + type of ``a``, otherwise it is a base-class array. shape : int or sequence of int, optional - Overrides the shape of the result. If order='K' and the number of - dimensions is unchanged, it will try to keep the same order; otherwise, - order='C' is implied. Note that although multi-dimensional arrays are + Overrides the shape of the result. If ``order='K'`` and the number of + dimensions is unchanged, it tries to keep the same order; otherwise, + ``order='C'`` is implied. Note that although multi-dimensional arrays are technically supported by Riptable, you may get unexpected results when working with them. Returns ------- - `FastArray` - A `FastArray` with the same shape and data type as the specified array, - filled with ones. + :py:class:`~.rt_fastarray.FastArray` + A :py:class:`~.rt_fastarray.FastArray` with the same shape and data type as the + specified array, filled with ones. See Also -------- - riptable.ones, riptable.zeros, riptable.zeros_like, riptable.empty, - riptable.empty_like, riptable.full + :py:func:`.rt_numpy.ones` + :py:func:`.rt_numpy.zeros` + :py:func:`.rt_numpy.zeros_like` + :py:func:`.rt_numpy.empty` + :py:func:`.rt_numpy.empty_like` + :py:func:`.rt_numpy.full` Examples -------- @@ -2700,30 +2759,32 @@ def zeros(shape, dtype=None, order="C", *, like=None) -> "FastArray": Shape of the new array, e.g., ``(2, 3)`` or ``2``. 
Note that although multi-dimensional arrays are technically supported by Riptable, you may get unexpected results when working with them. - - dtype : str or NumPy dtype or Riptable dtype, default `numpy.float64` + dtype : str or :py:class:`numpy.dtype` or Riptable dtype, default :py:obj:`numpy.float64` The desired data type for the array. - order : {'C', 'F'}, default 'C' Whether to store multi-dimensional data in row-major (C-style) or column-major (Fortran-style) order in memory. - - like : array_like, default None + like : array_like, default `None` Reference object to allow the creation of arrays that are not NumPy - arrays. If an array-like passed in as `like` supports the - ``__array_function__`` protocol, the result will be defined by it. + arrays. If an array-like passed in as ``like`` supports the + ``__array_function__`` protocol, the result is defined by it. In this case, it ensures the creation of an array object compatible with that passed in via this argument. Returns ------- - `FastArray` - A new `FastArray` of the specified shape and type, filled with zeros. + :py:class:`~.rt_fastarray.FastArray` + A new :py:class:`~.rt_fastarray.FastArray` of the specified shape and type, + filled with zeros. See Also -------- - riptable.zeros_like, riptable.ones, riptable.ones_like, riptable.empty, - riptable.empty_like, riptable.full + :py:func:`.rt_numpy.zeros_like` + :py:func:`.rt_numpy.ones` + :py:func:`.rt_numpy.ones_like` + :py:func:`.rt_numpy.empty` + :py:func:`.rt_numpy.empty_like` + :py:func:`.rt_numpy.full` Examples -------- @@ -2748,37 +2809,41 @@ def zeros_like(a, dtype=None, order="k", subok=True, shape=None) -> "FastArray": Parameters ---------- a : array - The shape and data type of `a` define the same attributes of the + The shape and data type of ``a`` define the same attributes of the returned array. Note that although multi-dimensional arrays are technically supported by Riptable, you may get unexpected results when working with them. 
- dtype : str or NumPy dtype or Riptable dtype, optional + dtype : str or :py:class:`numpy.dtype` or Riptable dtype, optional Overrides the data type of the result. order : {'C', 'F', 'A', or 'K'}, default 'K' Overrides the memory layout of the result. 'C' means row-major (C-style), - 'F' means column-major (Fortran-style), 'A' means 'F' if `a` is - Fortran-contiguous, 'C' otherwise. 'K' means match the layout of `a` as + 'F' means column-major (Fortran-style), 'A' means 'F' if ``a`` is + Fortran-contiguous, 'C' otherwise. 'K' means match the layout of ``a`` as closely as possible. - subok : bool, default True - If True (the default), then the newly created array will use the sub-class - type of `a`, otherwise it will be a base-class array. + subok : bool, default `True` + If `True` (the default), then the newly created array uses the sub-class + type of ``a``, otherwise it is a base-class array. shape : int or sequence of int, optional - Overrides the shape of the result. If order='K' and the number of - dimensions is unchanged, it will try to keep the same order; otherwise, - order='C' is implied. Note that although multi-dimensional arrays are + Overrides the shape of the result. If ``order='K'`` and the number of + dimensions is unchanged, it tries to keep the same order; otherwise, + ``order='C'`` is implied. Note that although multi-dimensional arrays are technically supported by Riptable, you may get unexpected results when working with them. Returns ------- - `FastArray` - A `FastArray` with the same shape and data type as the specified array, - filled with zeros. + :py:class:`~.rt_fastarray.FastArray` + A :py:class:`~.rt_fastarray.FastArray` with the same shape and data type as the + specified array, filled with zeros. 
See Also -------- - riptable.zeros, riptable.ones, riptable.ones_like, riptable.empty, - riptable.empty_like, riptable.full + :py:func:`.rt_numpy.zeros` + :py:func:`.rt_numpy.ones` + :py:func:`.rt_numpy.ones_like` + :py:func:`.rt_numpy.empty` + :py:func:`.rt_numpy.empty_like` + :py:func:`.rt_numpy.full` Examples -------- @@ -2786,8 +2851,8 @@ def zeros_like(a, dtype=None, order="k", subok=True, shape=None) -> "FastArray": >>> rt.zeros_like(a) FastArray([0, 0, 0, 0]) - >>> rt.zeros_like(a, dtype = float) - FastArray([1., 1., 1., 1.]) + >>> rt.zeros_like(a, dtype=float) + FastArray([0., 0., 0., 0.]) """ return LedgerFunction(np.zeros_like, a, dtype=dtype, order=order, subok=subok, shape=shape) @@ -2822,65 +2887,70 @@ def transpose(*args, **kwargs): # ------------------------------------------------------- def where(condition, x=None, y=None) -> FastArray | tuple[FastArray, ...]: """ - Return a new `FastArray` or `Categorical` with elements from `x` or `y` - depending on whether `condition` is True. + Return a new :py:class:`~.rt_fastarray.FastArray` or + :py:class:`~.rt_categorical.Categorical` with elements from ``x`` or ``y`` + depending on whether ``condition`` is `True`. For 1-dimensional arrays, this function is equivalent to:: [xv if c else yv for c, xv, yv in zip(condition, x, y)] - If only `condition` is provided, this function returns a tuple containing - an integer `FastArray` with the indices where the condition is True. Note - that this usage of `where` is not supported for `FastArray` objects of more - than one dimension. + If only ``condition`` is provided, this function returns a tuple containing + an integer :py:class:`~.rt_fastarray.FastArray` with the indices where ``condition`` + is `True`. Note that this usage of :py:func:`~.rt_numpy.where` is not supported for + :py:class:`~.rt_fastarray.FastArray` objects of more than one dimension. - Note also that this case of `where` uses `.riptable.bool_to_fancy()`. 
Using - `bool_to_fancy` directly is preferred, as it behaves correctly for - subclasses. + Note also that this case of :py:func:`~.rt_numpy.where` uses + :py:func:`~.rt_numpy.bool_to_fancy`. Using :py:func:`~.rt_numpy.bool_to_fancy` + directly is preferred, as it behaves correctly for subclasses. Parameters ---------- condition : bool or array of bool - Where True, yield `x`, otherwise yield `y`. + Where `True`, yield ``x``, otherwise yield ``y``. x : scalar, array, or callable, optional - The value to use where `condition` is True. If `x` is provided, `y` - must also be provided, and `x` and `y` should be the same type. If `x` - is an array, a callable that returns an array, or a `Categorical`, it - must be the same length as `condition`. The value of `x` that corresponds - to the True value is used. + The value to use where ``condition`` is `True`. If ``x`` is provided, ``y`` + must also be provided, and ``x`` and ``y`` should be the same type. If ``x`` + is an array, a callable that returns an array, or a + :py:class:`~.rt_categorical.Categorical`, it must be the same length as + ``condition``. The value of ``x`` that corresponds to the `True` value is used. y : scalar, array, or callable, optional - The value to use where `condition` is False. If `y` is provided, `x` - must also be provided, and `x` and `y` should be the same type. If `y` - is an array, a callable that returns an array, or a `Categorical`, it - must be the same length as `condition`. The value of `y` that corresponds - to the False value is used. + The value to use where ``condition`` is `False`. If ``y`` is provided, ``x`` + must also be provided, and ``x`` and ``y`` should be the same type. If ``y`` + is an array, a callable that returns an array, or a + :py:class:`~.rt_categorical.Categorical`, it must be the same length as + ``condition``. The value of ``y`` that corresponds to the `False` value is used. 
Returns ------- - FastArray or Categorical or tuple - If `x` and `y` are `Categorical` objects, a `Categorical` is returned. - Otherwise, if `x` and `y` are provided a `FastArray` is returned. When - only `condition` is provided, a tuple is returned containing an integer - `FastArray` with the indices where the condition is True. + :py:class:`~.rt_fastarray.FastArray` or :py:class:`~.rt_categorical.Categorical` or tuple + If ``x`` and ``y`` are :py:class:`~.rt_categorical.Categorical` objects, a + :py:class:`~.rt_categorical.Categorical` is returned. Otherwise, if ``x`` and + ``y`` are provided a :py:class:`~.rt_fastarray.FastArray` is returned. When + only ``condition`` is provided, a tuple is returned containing an integer + :py:class:`~.rt_fastarray.FastArray` with the indices where the condition is + `True`. See Also -------- - .FastArray.where : Replace values where a given condition is False. - riptable.bool_to_fancy : The function called when `x` and `y` are omitted. + :py:meth:`.rt_fastarray.FastArray.where` : + Replace values where a given condition is `False`. + :py:func:`.rt_numpy.bool_to_fancy` : + The function called when ``x`` and ``y`` are omitted. 
Examples -------- - `condition` is a comparison that creates an array of booleans, and `x` - and `y` are scalars: + ``condition`` is a comparison that creates an array of booleans, and ``x`` + and ``y`` are scalars: >>> a = rt.FastArray(rt.arange(5)) >>> a FastArray([0, 1, 2, 3, 4]) >>> rt.where(a < 2, 100, 200) - FastArray([100, 100, 200, 200, 200]) + FastArray([100, 100, 200, 200, 200], dtype=uint8) - `condition` and `x` are same-length arrays, and `y` is a + ``condition`` and ``x`` are same-length arrays, and ``y`` is a scalar: >>> condition = rt.FastArray([False, False, True, True, True]) @@ -2889,7 +2959,8 @@ def where(condition, x=None, y=None) -> FastArray | tuple[FastArray, ...]: >>> rt.where(condition, x, y) FastArray([200, 200, 102, 103, 104]) - When `x` and `y` are `Categorical` objects, a `Categorical` is returned: + When ``x`` and ``y`` are :py:class:`~.rt_categorical.Categorical` objects, a + :py:class:`~.rt_categorical.Categorical` is returned: >>> primary_traders = rt.Cat(['John', 'Mary', 'John', 'Mary', 'John', 'Mary']) >>> secondary_traders = rt.Cat(['Chris', 'Duncan', 'Chris', 'Duncan', 'Duncan', 'Chris']) @@ -2899,23 +2970,27 @@ def where(condition, x=None, y=None) -> FastArray | tuple[FastArray, ...]: FastArray([3, 4, 1, 4, 2, 4], dtype=int8) Base Index: 1 FastArray([b'Chris', b'Duncan', b'John', b'Mary'], dtype='|S6') Unique count: 4 - When `x` and `y` are `Date` objects, a `FastArray` of integers is returned - that can be converted to a `Date` (other datetime objects are similar): + When ``x`` and ``y`` are :py:class:`~.rt_datetime.Date` objects, a + :py:class:`~.rt_fastarray.FastArray` of integers is returned that can be converted + to a :py:class:`~.rt_datetime.Date` (other :py:obj:`.rt_datetime` objects are similar): >>> x = rt.Date(['20230101', '20220101', '20210101']) >>> y = rt.Date(['20190101', '20180101', '20170101']) >>> condition = x > rt.Date(['20211231']) >>> rt.where(condition, x, y) - >>> FastArray([19358, 18993, 17167]) + 
FastArray([19358, 18993, 17167], dtype=int32) + >>> rt.FastArray([19358, 18993, 17167]) + FastArray([19358, 18993, 17167]) >>> rt.Date(_) Date(['2023-01-01', '2022-01-01', '2017-01-01']) - When only a condition is provided, a tuple is returned containing a `FastArray` - with the indices where the condition is True: + When only a ``condition`` is provided, a tuple is returned containing a + :py:class:`~.rt_fastarray.FastArray` with the indices where the ``condition`` is + `True`: >>> a = rt.FastArray([10, 20, 30, 40, 50]) >>> rt.where(a < 40) - (FastArray([0, 1, 2]),) + (FastArray([0, 1, 2], dtype=int32),) """ if isinstance(x, TypeRegister.Categorical) and isinstance(y, TypeRegister.Categorical): z = TypeRegister.Categorical.hstack([x, y]) @@ -3184,45 +3259,64 @@ def sum(*args, filter=None, dtype=None, **kwargs) -> np.number | Dataset: When possible, ``rt.sum(x, *args)`` calls ``x.sum(*args)``; look there for documentation. In particular, note whether the called function accepts the - keyword arguments listed below. For example, `Dataset.sum()` does not accept - the `filter` or `dtype` keyword arguments. + keyword arguments listed below. For example, :py:meth:`.rt_dataset.Dataset.sum` + does not accept the ``filter`` or ``dtype`` keyword arguments. - For ``FastArray.sum``, see `numpy.sum` for documentation but note the following: + When a :py:class:`~.rt_fastarray.FastArray` is passed to :py:func:`.rt_numpy.sum`, + :py:func:`numpy.sum` is called. See the documentation for :py:func:`numpy.sum`, but + note the following: - * Until a reported bug is fixed, the `dtype` keyword argument may not work + * Until a reported bug is fixed, the ``dtype`` keyword argument may not work as expected: - * Riptable data types (for example, `rt.float64`) are ignored. - * NumPy integer data types (for example, `numpy.int32`) are also ignored. - * NumPy floating point data types are applied as `numpy.float64`. 
+ * Riptable data types (for example, :py:obj:`.rt_numpy.float64`) are ignored. + * NumPy integer data types (for example, :py:obj:`numpy.int32`) are also ignored. + * NumPy floating point data types are applied as :py:obj:`numpy.float64`. - * If you include another NumPy parameter (for example, ``axis=0``), the NumPy - implementation of ``sum`` will be used and the ``dtype`` will be used to - compute the sum. + * If you include another NumPy parameter (for example, ``axis=0``), :py:func:`numpy.sum` + is used and the ``dtype`` is used to compute the sum. Parameters ---------- - filter : array of bool, default None + *args : array or iterable or scalar value + Contains the values that are used to calculate the sum. + filter : array of bool, default `None` Specifies which elements to include in the sum calculation. - dtype : rt.dtype or numpy.dtype, optional - The data type of the result. By default, for integer input the result `dtype` is - ``int64`` and for floating point input the result `dtype` is ``float64``. See - the notes above about using this keyword argument with `FastArray` objects - as input. + dtype : :py:class:`numpy.dtype` or Riptable dtype, optional + The data type of the result. If not specified, the default ``dtype`` depends on + the input values. For example: + + - For a :py:class:`~.rt_fastarray.FastArray` with `int` values, the resulting + ``dtype`` is :py:obj:`numpy.int64`. + - For a :py:class:`~.rt_fastarray.FastArray` with `float` values, the + resulting ``dtype`` is :py:obj:`numpy.float64`. + - For a list with `int` values, the resulting ``dtype`` is `int`. + - For a list with `float` values, the resulting ``dtype`` is `float`. + + See the notes above about using this keyword argument with + :py:class:`~.rt_fastarray.FastArray` objects as input. + **kwargs : + Additional keyword arguments to be passed to the function. See + :py:func:`numpy.sum` for additional keyword arguments. 
Returns ------- - scalar or `Dataset` - Scalar for `FastArray` input. For `Dataset` input, returns a `Dataset` + scalar or :py:class:`~.rt_dataset.Dataset` + Scalar for :py:class:`~.rt_fastarray.FastArray` input. For + :py:class:`~.rt_dataset.Dataset` input, returns a :py:class:`~.rt_dataset.Dataset` consisting of a row with each numerical column's sum. See Also -------- - numpy.sum - nansum : Sums the values, ignoring NaNs. - FastArray.sum : Sums the values of a `FastArray`. - Dataset.sum : Sums the values of numerical `Dataset` columns. - GroupByOps.sum : Sums the values of each group. Used by `Categorical` objects. + :py:func:`numpy.sum` : + Sum of array elements over a given axis. + :py:func:`.rt_numpy.nansum` : + Sums the values, ignoring ``NaN`` values. + :py:meth:`.rt_dataset.Dataset.sum` : + Sums the values of numerical :py:class:`~.rt_dataset.Dataset` columns. + :py:meth:`.rt_groupbyops.GroupByOps.sum` : + Sums the values of each group. Used by :py:class:`~.rt_categorical.Categorical` + objects. Examples -------- @@ -3244,45 +3338,59 @@ def sum(*args, filter=None, dtype=None, **kwargs) -> np.number | Dataset: # ------------------------------------------------------- def nansum(*args, filter=None, dtype=None, **kwargs) -> np.number | Dataset: """ - Compute the sum of the values in the first argument, ignoring NaNs. + Compute the sum of the values in the first argument, ignoring ``NaN`` values. - If all values in the first argument are NaNs, ``0.0`` is returned. + If all values in the first argument are ``NaN`` values, ``0.0`` is returned. When possible, ``rt.nansum(x, *args)`` calls ``x.nansum(*args)``; look there for documentation. In particular, note whether the called function accepts the keyword arguments listed below. - For example, `FastArray.nansum` accepts the `filter` and `dtype` keyword arguments, - but `Dataset.nansum` does not. 
+ For example, :py:meth:`.rt_fastarray.FastArray.nansum` accepts the ``filter`` and + ``dtype`` keyword arguments, but :py:meth:`.rt_dataset.Dataset.nansum` does not. Parameters ---------- - filter : array of bool, default None + *args : array or iterable or scalar value + Contains the values that are used to calculate the sum. + filter : array of bool, default `None` Specifies which elements to include in the sum calculation. If the filter is - uniformly ``False``, `rt.nansum` returns ``0.0``. - dtype : rt.dtype or numpy.dtype, default float64 - The data type of the result. For a `FastArray` ``x``, + uniformly `False`, the method returns ``0.0``. + dtype : :py:class:`numpy.dtype` or Riptable dtype, default :py:obj:`numpy.float64` + The data type of the result. For a :py:class:`~.rt_fastarray.FastArray` ``x``, ``x.nansum(dtype = my_type)`` is equivalent to ``my_type(x.nansum())``. + **kwargs : + Additional keyword arguments to be passed to the function. See + :py:func:`numpy.nansum` for additional keyword arguments. Returns ------- - scalar or `Dataset` - Scalar for `FastArray` input. For `Dataset` input, returns a `Dataset` + scalar or :py:class:`~.rt_dataset.Dataset` + Scalar for :py:class:`~.rt_fastarray.FastArray` input. For + :py:class:`~.rt_dataset.Dataset` input, returns a :py:class:`~.rt_dataset.Dataset` consisting of a row with each numerical column's sum. See Also -------- - sum : Sums the values of the input. - FastArray.nansum : Sums the values of a `FastArray`, ignoring NaNs. - Dataset.nansum : Sums the values of numerical `Dataset` columns, ignoring NaNs. - GroupByOps.nansum : Sums the values of each group, ignoring NaNs. Used by - `Categorical` objects. + :py:func:`numpy.nansum` : + Return the sum of array elements over a given axis treating Not a Numbers + (``NaN``) as zero. + :py:func:`.rt_numpy.sum` : + Sums the values of the input. 
+ :py:meth:`.rt_fastarray.FastArray.nansum` : + Sums the values of a :py:class:`~.rt_fastarray.FastArray`, ignoring ``NaN`` values. + :py:meth:`.rt_dataset.Dataset.nansum` : + Sums the values of numerical :py:class:`~.rt_dataset.Dataset` columns, ignoring + ``NaN`` values. + :py:meth:`.rt_groupbyops.GroupByOps.nansum` : + Sums the values of each group, ignoring ``NaN`` values. Used by + :py:class:`~.rt_categorical.Categorical` objects. Notes ----- - The `dtype` keyword for `rt.nansum` specifies the data type of the result. This - differs from `numpy.nansum`, where it specifies the data type used to compute - the sum. + The ``dtype`` parameter specifies the data type of the result. This + differs from :py:func:`numpy.nansum`, where it specifies the data type used to + compute the sum. Examples -------- @@ -3290,13 +3398,13 @@ def nansum(*args, filter=None, dtype=None, **kwargs) -> np.number | Dataset: >>> rt.nansum(a) 16.0 - With a `dtype` specified: + With a ``dtype`` specified: >>> a = rt.FastArray([1.0, 3.0, 5.0, 7.0, rt.nan]) >>> rt.nansum(a, dtype = rt.int32) 16 - With a filter: + With a ``filter``: >>> a = rt.FastArray([1, 3, 5, 7, rt.nan]) >>> b = rt.FastArray([False, True, False, True, True]) @@ -3496,36 +3604,48 @@ def mean(*args, filter=None, dtype=None, **kwargs) -> np.number | Dataset: documentation. In particular, note whether the called function accepts the keyword arguments listed below. - For example, `FastArray.mean` accepts the `filter` and `dtype` keyword arguments, - but `Dataset.mean` does not. + For example, :py:meth:`.rt_fastarray.FastArray.mean` accepts the ``filter`` and + ``dtype`` keyword arguments, but :py:meth:`.rt_dataset.Dataset.mean` does not. Parameters ---------- - filter : array of bool, default None + *args : array or iterable or scalar value + Contains the values that are used to calculate the mean. + filter : array of bool, default `None` Specifies which elements to include in the mean calculation. 
If the filter is - uniformly ``False``, `rt.mean` returns a `ZeroDivisionError`. - dtype : rt.dtype or numpy.dtype, default float64 - The data type of the result. For a `FastArray` ``x``, + uniformly `False`, :py:func:`~.rt_numpy.mean` returns a :py:class:`ZeroDivisionError`. + dtype : :py:class:`numpy.dtype` or Riptable dtype, default :py:obj:`numpy.float64` + The data type of the result. For a :py:class:`~.rt_fastarray.FastArray` ``x``, ``x.mean(dtype = my_type)`` is equivalent to ``my_type(x.mean())``. + **kwargs : + Additional keyword arguments to be passed to the function. See + :py:func:`numpy.mean` for additional keyword arguments. Returns ------- - scalar or `Dataset` - Scalar for `FastArray` input. For `Dataset` input, returns a `Dataset` + scalar or :py:class:`~.rt_dataset.Dataset` + Scalar for :py:class:`~.rt_fastarray.FastArray` input. For + :py:class:`~.rt_dataset.Dataset` input, returns a :py:class:`~.rt_dataset.Dataset` consisting of a row with each numerical column's mean. See Also -------- - nanmean : Computes the mean, ignoring NaNs. - Dataset.mean : Computes the mean of numerical `Dataset` columns. - FastArray.mean : Computes the mean of `FastArray` values. - GroupByOps.mean : Computes the mean of each group. Used by `Categorical` objects. + :py:func:`numpy.mean` : + Computes the arithmetic mean along the specified axis. + :py:func:`.rt_numpy.nanmean` : + Computes the mean, ignoring ``NaN`` values. + :py:meth:`.rt_dataset.Dataset.mean` : + Computes the mean of numerical :py:class:`~.rt_dataset.Dataset` columns. + :py:meth:`.rt_fastarray.FastArray.mean` : + Computes the mean of :py:class:`~.rt_fastarray.FastArray` values. + :py:meth:`.rt_groupbyops.GroupByOps.mean` : + Computes the mean of each group. Used by :py:class:`~.rt_categorical.Categorical` + objects. Notes ----- - The `dtype` keyword for `rt.mean` specifies the data type of the result. This - differs from `numpy.mean`, where it specifies the data type used to compute - the mean. 
+ The ``dtype`` parameter specifies the data type of the result. This differs from + :py:func:`numpy.mean`, where it specifies the data type used to compute the mean. Examples -------- @@ -3533,13 +3653,13 @@ def mean(*args, filter=None, dtype=None, **kwargs) -> np.number | Dataset: >>> rt.mean(a) 4.0 - With a `dtype` specified: + With a ``dtype`` specified: >>> a = rt.FastArray([1, 3, 5, 7]) >>> rt.mean(a, dtype = rt.int32) 4 - With a filter: + With a ``filter``: >>> a = rt.FastArray([1, 3, 5, 7]) >>> b = rt.FastArray([False, True, False, True]) @@ -3556,45 +3676,58 @@ def mean(*args, filter=None, dtype=None, **kwargs) -> np.number | Dataset: # ------------------------------------------------------- def nanmean(*args, filter=None, dtype=None, **kwargs) -> np.number | Dataset: """ - Compute the arithmetic mean of the values in the first argument, ignoring NaNs. + Compute the arithmetic mean of the values in the first argument, ignoring ``NaN`` + values. - If all values in the first argument are NaNs, ``0.0`` is returned. + If all values in the first argument are ``NaN`` values, ``0.0`` is returned. When possible, ``rt.nanmean(x, *args)`` calls ``x.nanmean(*args)``; look there for documentation. In particular, note whether the called function accepts the keyword arguments listed below. - For example, `FastArray.nanmean` accepts the `filter` and `dtype` keyword arguments, - but `Dataset.nanmean` does not. + For example, :py:meth:`.rt_fastarray.FastArray.nanmean` accepts the ``filter`` and + ``dtype`` keyword arguments, but :py:meth:`.rt_dataset.Dataset.nanmean` does not. Parameters ---------- - filter : array of bool, default None + *args : array or iterable or scalar value + Contains the values that are used to calculate the mean. + filter : array of bool, default `None` Specifies which elements to include in the mean calculation. If the filter is - uniformly ``False``, `rt.nanmean` returns a `ZeroDivisionError`. 
- dtype : rt.dtype or numpy.dtype, default float64 - The data type of the result. For a `FastArray` ``x``, + uniformly `False`, the method returns a :py:class:`ZeroDivisionError`. + dtype : :py:class:`numpy.dtype` or Riptable dtype, default :py:obj:`numpy.float64` + The data type of the result. For a :py:class:`~.rt_fastarray.FastArray` ``x``, ``x.nanmean(dtype = my_type)`` is equivalent to ``my_type(x.nanmean())``. + **kwargs : + Additional keyword arguments to be passed to the function. See + :py:func:`numpy.nanmean` for additional keyword arguments. Returns ------- - scalar or `Dataset` - Scalar for `FastArray` input. For `Dataset` input, returns a `Dataset` + scalar or :py:class:`~.rt_dataset.Dataset` + Scalar for :py:class:`~.rt_fastarray.FastArray` input. For + :py:class:`~.rt_dataset.Dataset` input, returns a :py:class:`~.rt_dataset.Dataset` consisting of a row with each numerical column's mean. See Also -------- - mean : Computes the mean. - Dataset.nanmean : Computes the mean of numerical `Dataset` columns, ignoring NaNs. - FastArray.nanmean : Computes the mean of `FastArray` values, ignoring NaNs. - GroupByOps.nanmean : Computes the mean of each group, ignoring NaNs. Used by - `Categorical` objects. + :py:func:`numpy.nanmean` : + Compute the arithmetic mean along the specified axis, ignoring ``NaN`` values. + :py:func:`.rt_numpy.mean` : + Computes the mean. + :py:meth:`.rt_dataset.Dataset.nanmean` : + Computes the mean of numerical :py:class:`~.rt_dataset.Dataset` columns, + ignoring ``NaN`` values. + :py:meth:`.rt_fastarray.FastArray.nanmean` : + Computes the mean of :py:class:`~.rt_fastarray.FastArray` values, ignoring ``NaN`` values. + :py:meth:`.rt_groupbyops.GroupByOps.nanmean` : + Computes the mean of each group, ignoring ``NaN`` values. Used by + :py:class:`~.rt_categorical.Categorical` objects. Notes ----- - The `dtype` keyword for `rt.nanmean` specifies the data type of the result. 
This - differs from `numpy.nanmean`, where it specifies the data type used to compute - the mean. + The ``dtype`` parameter specifies the data type of the result. This differs from + :py:func:`numpy.nanmean`, where it specifies the data type used to compute the mean. Examples -------- @@ -3602,13 +3735,13 @@ def nanmean(*args, filter=None, dtype=None, **kwargs) -> np.number | Dataset: >>> rt.nanmean(a) 3.0 - With a `dtype` specified: + With a ``dtype`` specified: >>> a = rt.FastArray([1, 3, 5, rt.nan]) >>> rt.nanmean(a, dtype = rt.int32) 3 - With a filter: + With a ``filter``: >>> a = rt.FastArray([1, 3, 5, rt.nan]) >>> b = rt.FastArray([False, True, True, True]) @@ -3652,36 +3785,49 @@ def var(*args, filter=None, dtype=None, **kwargs) -> np.number | Dataset: documentation. In particular, note whether the called function accepts the keyword arguments listed below. - For example, `FastArray.var` accepts the `filter` and `dtype` keyword arguments, - but `Dataset.var` does not. + For example, :py:meth:`.rt_fastarray.FastArray.var` accepts the ``filter`` and + ``dtype`` keyword arguments, but :py:meth:`.rt_dataset.Dataset.var` does not. Parameters ---------- - filter : array of bool, default None - Specifies which elements to include in the variance calculation. If the filter - is uniformly ``False``, `rt.var` returns a `ZeroDivisionError`. - - dtype : rt.dtype or numpy.dtype, default float64 - The data type of the result. For a `FastArray` ``x``, + *args : array or iterable or scalar value + Contains the values that are used to calculate the variance. + filter : array of bool, default `None` + Specifies which elements to include in the variance calculation. If the ``filter`` + is uniformly `False`, the method returns a :py:class:`ZeroDivisionError`. + dtype : :py:class:`numpy.dtype` or Riptable dtype, default :py:obj:`numpy.float64` + The data type of the result. 
For a :py:class:`~.rt_fastarray.FastArray` ``x``, ``x.var(dtype = my_type)`` is equivalent to ``my_type(x.var())``. + **kwargs : + Additional keyword arguments to be passed to the function. See + :py:func:`numpy.var` for additional keyword arguments. Returns ------- - scalar or `Dataset` - Scalar for `FastArray` input. For `Dataset` input, returns a `Dataset` + scalar or :py:class:`~.rt_dataset.Dataset` + Scalar for :py:class:`~.rt_fastarray.FastArray` input. For + :py:class:`~.rt_dataset.Dataset` input, returns a :py:class:`~.rt_dataset.Dataset` consisting of a row with each numerical column's variance. See Also -------- - nanvar : Computes the variance, ignoring NaNs. - FastArray.var : Computes the variance of `FastArray` values. - Dataset.var : Computes the variance of numerical `Dataset` columns. - GroupByOps.var : Computes the variance of each group. Used by `Categorical` objects. + :py:func:`numpy.var` : + Compute the variance along the specified axis. + :py:func:`.rt_numpy.nanvar` : + Computes the variance, ignoring ``NaN`` values. + :py:meth:`.rt_fastarray.FastArray.var` : + Computes the variance of :py:class:`~.rt_fastarray.FastArray` values. + :py:meth:`.rt_dataset.Dataset.var` : + Computes the variance of numerical :py:class:`~.rt_dataset.Dataset` columns. + :py:meth:`.rt_groupbyops.GroupByOps.var` : + Computes the variance of each group. Used by + :py:class:`~.rt_categorical.Categorical` objects. Notes ----- - The `dtype` keyword for `rt.var` specifies the data type of the result. This differs - from `numpy.var`, where it specifies the data type used to compute the variance. + The ``dtype`` parameter specifies the data type of the result. This differs + from :py:func:`numpy.var`, where it specifies the data type used to compute the + variance. 
Examples -------- @@ -3689,13 +3835,13 @@ def var(*args, filter=None, dtype=None, **kwargs) -> np.number | Dataset: >>> rt.var(a) 1.0 - With a `dtype` specified: + With a ``dtype`` specified: >>> a = rt.FastArray([1, 2, 3]) >>> rt.var(a, dtype = rt.int32) 1 - With a filter: + With a ``filter``: >>> a = rt.FastArray([1, 2, 3]) >>> b = rt.FastArray([False, True, True]) @@ -3712,9 +3858,9 @@ def var(*args, filter=None, dtype=None, **kwargs) -> np.number | Dataset: # ------------------------------------------------------- def nanvar(*args, filter=None, dtype=None, **kwargs) -> np.number | Dataset: """ - Compute the variance of the values in the first argument, ignoring NaNs. + Compute the variance of the values in the first argument, ignoring ``NaN`` values. - If all values in the first argument are NaNs, ``NaN`` is returned. + If all values in the first argument are ``NaN`` values, ``NaN`` is returned. Riptable uses the convention that ``ddof = 1``, meaning the variance of ``[x_1, ..., x_n]`` is defined by ``var = 1/(n - 1) * sum(x_i - mean )**2`` (note @@ -3725,38 +3871,51 @@ def nanvar(*args, filter=None, dtype=None, **kwargs) -> np.number | Dataset: documentation. In particular, note whether the called function accepts the keyword arguments listed below. - For example, `FastArray.nanvar` accepts the `filter` and `dtype` keyword arguments, - but `Dataset.nanvar` does not. + For example, :py:meth:`.rt_fastarray.FastArray.nanvar` accepts the ``filter`` and + ``dtype`` keyword arguments, but :py:meth:`.rt_dataset.Dataset.nanvar` does not. Parameters ---------- - filter : array of bool, default None + *args : array or iterable or scalar value + Contains the values that are used to calculate the variance. + filter : array of bool, default `None` Specifies which elements to include in the variance calculation. If the filter - is uniformly ``False``, `rt.nanvar` returns a `ZeroDivisionError`. 
- - dtype : rt.dtype or numpy.dtype, default float64 - The data type of the result. For a `FastArray` ``x``, + is uniformly `False`, the method returns a :py:class:`ZeroDivisionError`. + dtype : :py:class:`numpy.dtype` or Riptable dtype, default :py:obj:`numpy.float64` + The data type of the result. For a :py:class:`~.rt_fastarray.FastArray` ``x``, ``x.nanvar(dtype = my_type)`` is equivalent to ``my_type(x.nanvar())``. + **kwargs : + Additional keyword arguments to be passed to the function. See + :py:func:`numpy.nanvar` for additional keyword arguments. Returns ------- - scalar or `Dataset` - Scalar for `FastArray` input. For `Dataset` input, returns a `Dataset` + scalar or :py:class:`~.rt_dataset.Dataset` + Scalar for :py:class:`~.rt_fastarray.FastArray` input. For + :py:class:`~.rt_dataset.Dataset` input, returns a :py:class:`~.rt_dataset.Dataset` consisting of a row with each numerical column's variance. See Also -------- - var : Computes the variance. - FastArray.nanvar : Computes the variance of `FastArray` values, ignoring NaNs. - Dataset.nanvar : Computes the variance of numerical `Dataset` columns, ignoring NaNs. - GroupByOps.nanvar : Computes the variance of each group, ignoring NaNs. Used by - `Categorical` objects. + :py:func:`numpy.nanvar` : + Compute the variance along the specified axis, while ignoring ``NaN`` values. + :py:func:`.rt_numpy.var` : + Computes the variance. + :py:meth:`.rt_fastarray.FastArray.nanvar` : + Computes the variance of :py:class:`~.rt_fastarray.FastArray` values, ignoring + ``NaN`` values. + :py:meth:`.rt_dataset.Dataset.nanvar` : + Computes the variance of numerical :py:class:`~.rt_dataset.Dataset` columns, + ignoring ``NaN`` values. + :py:meth:`.rt_groupbyops.GroupByOps.nanvar` : + Computes the variance of each group, ignoring ``NaN`` values. Used by + :py:class:`~.rt_categorical.Categorical` objects. Notes ----- - The `dtype` keyword for `rt.nanvar` specifies the data type of the - result. 
This differs from `numpy.nanvar`, where it specifies the data type used to - compute the variance. + The ``dtype`` parameter specifies the data type of the result. This differs from + :py:func:`numpy.nanvar`, where it specifies the data type used to compute the + variance. Examples -------- @@ -3764,13 +3923,13 @@ def nanvar(*args, filter=None, dtype=None, **kwargs) -> np.number | Dataset: >>> rt.nanvar(a) 1.0 - With a `dtype` specified: + With a ``dtype`` specified: >>> a = rt.FastArray([1, 2, 3, rt.nan]) >>> rt.nanvar(a, dtype = rt.int32) 1 - With a filter: + With a ``filter``: >>> a = rt.FastArray([1, 2, 3, rt.nan]) >>> b = rt.FastArray([False, True, True, True]) @@ -3798,38 +3957,50 @@ def std(*args, filter=None, dtype=None, **kwargs): documentation. In particular, note whether the called function accepts the keyword arguments listed below. - For example, `FastArray.std` accepts the `filter` and `dtype` keyword arguments, - but `Dataset.std` does not. + For example, :py:meth:`.rt_fastarray.FastArray.std` accepts the ``filter`` and + ``dtype`` keyword arguments, but :py:meth:`.rt_dataset.Dataset.std` does not. Parameters ---------- - filter : array of bool, default None + *args : array or iterable or scalar value + Contains the values that are used to calculate the standard deviation. + filter : array of bool, default `None` Specifies which elements to include in the standard deviation calculation. If - the filter is uniformly ``False``, `rt.std` returns a `ZeroDivisionError`. - - dtype : rt.dtype or numpy.dtype, default float64 - The data type of the result. For a `FastArray` ``x``, + the filter is uniformly `False`, the method returns a :py:class:`ZeroDivisionError`. + dtype : :py:class:`numpy.dtype` or Riptable dtype, default :py:obj:`numpy.float64` + The data type of the result. For a :py:class:`~.rt_fastarray.FastArray` ``x``, ``x.std(dtype = my_type)`` is equivalent to ``my_type(x.std())``. 
+ **kwargs : + Additional keyword arguments to be passed to the function. See + :py:func:`numpy.std` for additional keyword arguments. Returns ------- - scalar or `Dataset` - Scalar for `FastArray` input. For `Dataset` input, returns a `Dataset` + scalar or :py:class:`~.rt_dataset.Dataset` + Scalar for :py:class:`~.rt_fastarray.FastArray` input. For + :py:class:`~.rt_dataset.Dataset` input, returns a :py:class:`~.rt_dataset.Dataset` consisting of a row with each numerical column's standard deviation. See Also -------- - nanstd : Computes the standard deviation, ignoring NaNs. - FastArray.std : Computes the standard deviation of `FastArray` values. - Dataset.std : Computes the standard deviation of numerical `Dataset` columns. - GroupByOps.std : Computes the standard deviation of each group. Used by - `Categorical` objects. + :py:func:`numpy.std` : + Compute the standard deviation along the specified axis. + :py:func:`.rt_numpy.nanstd` : + Computes the standard deviation, ignoring ``NaN`` values. + :py:meth:`.rt_fastarray.FastArray.std` : + Computes the standard deviation of :py:class:`~.rt_fastarray.FastArray` values. + :py:meth:`.rt_dataset.Dataset.std` : + Computes the standard deviation of numerical :py:class:`~.rt_dataset.Dataset` + columns. + :py:meth:`.rt_groupbyops.GroupByOps.std` : + Computes the standard deviation of each group. Used by + :py:class:`~.rt_categorical.Categorical` objects. Notes ----- - The `dtype` keyword for `rt.std` specifies the data type of the result. This differs - from `numpy.std`, where it specifies the data type used to compute the standard - deviation. + The ``dtype`` parameter specifies the data type of the result. This differs + from :py:func:`numpy.std`, where it specifies the data type used to compute the + standard deviation. 
Examples -------- @@ -3837,13 +4008,13 @@ def std(*args, filter=None, dtype=None, **kwargs): >>> rt.std(a) 1.0 - With a `dtype` specified: + With a ``dtype`` specified: >>> a = rt.FastArray([1, 2, 3]) >>> rt.std(a, dtype = rt.int32) 1 - With a filter: + With a ``filter``: >>> a = rt.FastArray([1, 2, 3]) >>> b = rt.FA([False, True, True]) @@ -3860,9 +4031,10 @@ def std(*args, filter=None, dtype=None, **kwargs): # ------------------------------------------------------- def nanstd(*args, filter=None, dtype=None, **kwargs) -> np.number | Dataset: """ - Compute the standard deviation of the values in the first argument, ignoring NaNs. + Compute the standard deviation of the values in the first argument, ignoring ``NaN`` + values. - If all values in the first argument are NaNs, ``NaN`` is returned. + If all values in the first argument are ``NaN`` values, ``NaN`` is returned. Riptable uses the convention that ``ddof = 1``, meaning the standard deviation of ``[x_1, ..., x_n]`` is defined by ``std = 1/(n - 1) * sum(x_i - mean )**2`` (note @@ -3873,39 +4045,51 @@ def nanstd(*args, filter=None, dtype=None, **kwargs) -> np.number | Dataset: documentation. In particular, note whether the called function accepts the keyword arguments listed below. - For example, `FastArray.nanstd` accepts the `filter` and `dtype` keyword arguments, - but `Dataset.nanstd` does not. + For example, :py:meth:`.rt_fastarray.FastArray.nanstd` accepts the ``filter`` and + ``dtype`` keyword arguments, but :py:meth:`.rt_dataset.Dataset.nanstd` does not. Parameters ---------- - filter : array of bool, default None + *args : array or iterable or scalar value + Contains the values that are used to calculate the standard deviation. + filter : array of bool, default `None` Specifies which elements to include in the standard deviation calculation. If - the filter is uniformly ``False``, `rt.nanstd` returns a `ZeroDivisionError`. 
- - dtype : rt.dtype or numpy.dtype, default float64 - The data type of the result. For a `FastArray` ``x``, + the filter is uniformly `False`, the method returns a :py:class:`ZeroDivisionError`. + dtype : :py:class:`numpy.dtype` or Riptable dtype, default :py:obj:`numpy.float64` + The data type of the result. For a :py:class:`~.rt_fastarray.FastArray` ``x``, ``x.nanstd(dtype = my_type)`` is equivalent to ``my_type(x.nanstd())``. + **kwargs : + Additional keyword arguments to be passed to the function. See + :py:func:`numpy.nanstd` for additional keyword arguments. Returns ------- - scalar or `Dataset` - Scalar for `FastArray` input. For `Dataset` input, returns a `Dataset` + scalar or :py:class:`~.rt_dataset.Dataset` + Scalar for :py:class:`~.rt_fastarray.FastArray` input. For + :py:class:`~.rt_dataset.Dataset` input, returns a :py:class:`~.rt_dataset.Dataset` consisting of a row with each numerical column's standard deviation. See Also -------- - std : Computes the standard deviation. - FastArray.nanstd : Computes the standard deviation of `FastArray` values, ignoring - NaNs. - Dataset.nanstd : Computes the standard deviation of numerical `Dataset` columns, - ignoring NaNs. - GroupByOps.nanstd : Computes the standard deviation of each group, ignoring NaNs. - Used by `Categorical` objects. + :py:func:`numpy.nanstd` : + Compute the standard deviation along the specified axis, while ignoring ``NaN`` + values. + :py:func:`.rt_numpy.std` : + Computes the standard deviation. + :py:meth:`.rt_fastarray.FastArray.nanstd` : + Computes the standard deviation of :py:class:`~.rt_fastarray.FastArray` values, + ignoring ``NaN`` values. + :py:meth:`.rt_dataset.Dataset.nanstd` : + Computes the standard deviation of numerical :py:class:`~.rt_dataset.Dataset` + columns, ignoring ``NaN`` values. + :py:meth:`.rt_groupbyops.GroupByOps.nanstd` : + Computes the standard deviation of each group, ignoring ``NaN`` values. Used by + :py:class:`~.rt_categorical.Categorical` objects. 
Notes ----- - The `dtype` keyword for `rt.nanstd` specifies the data type of the result. This - differs from `numpy.nanstd`, where it specifies the data type used to compute + The ``dtype`` parameter specifies the data type of the result. This differs from + :py:func:`numpy.nanstd`, where it specifies the data type used to compute the standard deviation. Examples @@ -3914,13 +4098,13 @@ def nanstd(*args, filter=None, dtype=None, **kwargs) -> np.number | Dataset: >>> rt.nanstd(a) 1.0 - With a `dtype` specified: + With a ``dtype`` specified: >>> a = rt.FastArray([1, 2, 3, rt.nan]) >>> rt.nanstd(a, dtype = rt.int32) 1 - With filter: + With ``filter``: >>> a = rt.FastArray([1, 2, 3, rt.nan]) >>> b = rt.FastArray([False, True, True, True]) @@ -4068,32 +4252,43 @@ def bincount(*args, **kwargs) -> int: # ------------------------------------------------------- def isnan(*args, **kwargs) -> FastArray | bool: """ - Return True for each element that's a NaN (Not a Number), False otherwise. + Return `True` for each element that's a ``NaN`` (Not a Number), `False` otherwise. Parameters ---------- *args : - See :py:data:`numpy.isnan`. + See :py:obj:`numpy.isnan`. **kwargs : - See :py:data:`numpy.isnan`. + See :py:obj:`numpy.isnan`. Returns ------- - `FastArray` or bool - For array input, a `FastArray` of booleans is returned that's True for each - element that's a NaN, False otherwise. For scalar input, a boolean is returned. + :py:class:`~.rt_fastarray.FastArray` or bool + For array input, a :py:class:`~.rt_fastarray.FastArray` of booleans is returned + that's `True` for each element that's a ``NaN``, `False` otherwise. For scalar + input, a boolean is returned. 
See Also -------- - riptable.isnotnan, riptable.isnanorzero, FastArray.isnan, FastArray.isnotnan, - FastArray.notna, FastArray.isnanorzero, Categorical.isnan, Categorical.isnotnan, - Categorical.notna, Date.isnan, Date.isnotnan, DateTimeNano.isnan, - DateTimeNano.isnotnan - Dataset.mask_or_isnan : - Return a boolean array that's True for each `Dataset` row that contains - at least one NaN. - Dataset.mask_and_isnan : - Return a boolean array that's True for each all-NaN `Dataset` row. + :py:func:`.rt_numpy.isnotnan` + :py:func:`.rt_numpy.isnanorzero` + :py:meth:`.rt_fastarray.FastArray.isnan` + :py:meth:`.rt_fastarray.FastArray.isnotnan` + :py:meth:`.rt_fastarray.FastArray.notna` + :py:meth:`.rt_fastarray.FastArray.isnanorzero` + :py:meth:`.rt_categorical.Categorical.isnan` + :py:meth:`.rt_categorical.Categorical.isnotnan` + :py:meth:`.rt_categorical.Categorical.notna` + :py:meth:`.rt_datetime.Date.isnan` + :py:meth:`.rt_datetime.Date.isnotnan` + :py:meth:`.rt_datetime.DateTimeNano.isnan` + :py:meth:`.rt_datetime.DateTimeNano.isnotnan` + :py:meth:`.rt_dataset.Dataset.mask_or_isnan` : + Return a boolean array that's `True` for each :py:class:`~.rt_dataset.Dataset` + row that contains at least one ``NaN``. + :py:meth:`.rt_dataset.Dataset.mask_and_isnan` : + Return a boolean array that's `True` for each row that contains only ``NaN`` + values. Examples -------- @@ -4113,33 +4308,43 @@ def isnan(*args, **kwargs) -> FastArray | bool: # ------------------------------------------------------- def isnotnan(*args, **kwargs) -> FastArray | bool: """ - Return True for each element that's not a NaN (Not a Number), False otherwise. + Return `True` for each element that's not a ``NaN`` (Not a Number), `False` otherwise. Parameters ---------- *args : - See :py:data:`numpy.isnan`. + See :py:obj:`numpy.isnan`. **kwargs : - See :py:data:`numpy.isnan`. + See :py:obj:`numpy.isnan`. 
Returns ------- - `FastArray` or bool - For array input, a `FastArray` of booleans is returned that's True for each - element that's not a NaN, False otherwise. For scalar input, a boolean is - returned. + :py:class:`~.rt_fastarray.FastArray` or bool + For array input, a :py:class:`~.rt_fastarray.FastArray` of booleans is returned + that's `True` for each element that's not a ``NaN``, `False` otherwise. For scalar + input, a boolean is returned. See Also -------- - riptable.isnan, riptable.isnanorzero, FastArray.isnan, FastArray.isnotnan, - FastArray.notna, FastArray.isnanorzero, Categorical.isnan, Categorical.isnotnan, - Categorical.notna, Date.isnan, Date.isnotnan, DateTimeNano.isnan, - DateTimeNano.isnotnan - Dataset.mask_or_isnan : - Return a boolean array that's True for each `Dataset` row that contains - at least one NaN. - Dataset.mask_and_isnan : - Return a boolean array that's True for each all-NaN `Dataset` row. + :py:func:`.rt_numpy.isnan` + :py:func:`.rt_numpy.isnanorzero` + :py:meth:`.rt_fastarray.FastArray.isnan` + :py:meth:`.rt_fastarray.FastArray.isnotnan` + :py:meth:`.rt_fastarray.FastArray.notna` + :py:meth:`.rt_fastarray.FastArray.isnanorzero` + :py:meth:`.rt_categorical.Categorical.isnan` + :py:meth:`.rt_categorical.Categorical.isnotnan` + :py:meth:`.rt_categorical.Categorical.notna` + :py:meth:`.rt_datetime.Date.isnan` + :py:meth:`.rt_datetime.Date.isnotnan` + :py:meth:`.rt_datetime.DateTimeNano.isnan` + :py:meth:`.rt_datetime.DateTimeNano.isnotnan` + :py:meth:`.rt_dataset.Dataset.mask_or_isnan` : + Return a boolean array that's `True` for each :py:class:`~.rt_dataset.Dataset` + row that contains at least one ``NaN``. + :py:meth:`.rt_dataset.Dataset.mask_and_isnan` : + Return a boolean array that's `True` for each row that contains only ``NaN`` + values. 
Examples -------- @@ -4159,32 +4364,42 @@ def isnotnan(*args, **kwargs) -> FastArray | bool: # ------------------------------------------------------- def isnanorzero(*args, **kwargs) -> FastArray | bool: """ - Return True for each element that's a NaN (Not a Number) or zero, False otherwise. + Return `True` for each element that's a ``NaN`` (Not a Number) or zero, `False` + otherwise. Parameters ---------- *args : - See :py:data:`numpy.isnan`. + See :py:obj:`numpy.isnan`. **kwargs : - See :py:data:`numpy.isnan`. + See :py:obj:`numpy.isnan`. Returns ------- - `FastArray` or bool - For array input, a `FastArray` of booleans is returned that's True for each - element that's a NaN or zero, False otherwise. For scalar input, a boolean is - returned. + :py:class:`~.rt_fastarray.FastArray` or bool + For array input, a :py:class:`~.rt_fastarray.FastArray` of booleans is returned + that's `True` for each element that's a ``NaN`` or zero, `False` otherwise. For + scalar input, a boolean is returned. See Also -------- - FastArray.isnanorzero, riptable.isnan, riptable.isnotnan, FastArray.isnan, - FastArray.isnotnan, Categorical.isnan, Categorical.isnotnan, Date.isnan, - Date.isnotnan, DateTimeNano.isnan, DateTimeNano.isnotnan - Dataset.mask_or_isnan : - Return a boolean array that's True for each `Dataset` row that contains at least - one NaN. - Dataset.mask_and_isnan : - Return a boolean array that's True for each all-NaN `Dataset` row. 
+ :py:func:`.rt_numpy.isnan` + :py:func:`.rt_numpy.isnotnan` + :py:meth:`.rt_fastarray.FastArray.isnan` + :py:meth:`.rt_fastarray.FastArray.isnotnan` + :py:meth:`.rt_fastarray.FastArray.isnanorzero` + :py:meth:`.rt_categorical.Categorical.isnan` + :py:meth:`.rt_categorical.Categorical.isnotnan` + :py:meth:`.rt_datetime.Date.isnan` + :py:meth:`.rt_datetime.Date.isnotnan` + :py:meth:`.rt_datetime.DateTimeNano.isnan` + :py:meth:`.rt_datetime.DateTimeNano.isnotnan` + :py:meth:`.rt_dataset.Dataset.mask_or_isnan` : + Return a boolean array that's `True` for each :py:class:`~.rt_dataset.Dataset` + row that contains at least one ``NaN``. + :py:meth:`.rt_dataset.Dataset.mask_and_isnan` : + Return a boolean array that's `True` for each row that contains only ``NaN`` + values. Examples -------- @@ -4207,46 +4422,52 @@ def isnanorzero(*args, **kwargs) -> FastArray | bool: # ------------------------------------------------------- def isfinite(*args, **kwargs) -> FastArray | bool: """ - Return True for each finite element, False otherwise. + Return `True` for each finite element, `False` otherwise. A value is considered to be finite if it's not positive or negative infinity - or a NaN (Not a Number). + or a ``NaN`` (Not a Number). Parameters ---------- *args : - See :py:data:`numpy.isfinite`. + See :py:obj:`numpy.isfinite`. **kwargs : - See :py:data:`numpy.isfinite`. + See :py:obj:`numpy.isfinite`. Returns ------- - `FastArray` or bool - For array input, a `FastArray` of booleans is returned that's True for each - element that's finite, False otherwise. For scalar input, a boolean is returned. + :py:class:`~.rt_fastarray.FastArray` or bool + For array input, a :py:class:`~.rt_fastarray.FastArray` of booleans is returned + that's `True` for each element that's finite, `False` otherwise. For scalar + input, a boolean is returned. 
See Also -------- - riptable.isnotfinite, riptable.isinf, riptable.isnotinf, FastArray.isfinite, - FastArray.isnotfinite, FastArray.isinf, FastArray.isnotinf - Dataset.mask_or_isfinite : - Return a boolean array that's True for each `Dataset` row that has at least - one finite value. - Dataset.mask_and_isfinite : - Return a boolean array that's True for each `Dataset` row that contains all - finite values. - Dataset.mask_or_isinf : - Return a boolean array that's True for each `Dataset` row that has at least - one value that's positive or negative infinity. - Dataset.mask_and_isinf : - Return a boolean array that's True for each `Dataset` row that contains all - infinite values. + :py:func:`.rt_numpy.isnotfinite` + :py:func:`.rt_numpy.isinf` + :py:func:`.rt_numpy.isnotinf` + :py:meth:`.rt_fastarray.FastArray.isfinite` + :py:meth:`.rt_fastarray.FastArray.isnotfinite` + :py:meth:`.rt_fastarray.FastArray.isinf` + :py:meth:`.rt_fastarray.FastArray.isnotinf` + :py:meth:`.rt_dataset.Dataset.mask_or_isfinite` : + Return a boolean array that's `True` for each :py:class:`~.rt_dataset.Dataset` + row that has at least one finite value. + :py:meth:`.rt_dataset.Dataset.mask_and_isfinite` : + Return a boolean array that's `True` for each :py:class:`~.rt_dataset.Dataset` + row that contains all finite values. + :py:meth:`.rt_dataset.Dataset.mask_or_isinf` : + Return a boolean array that's `True` for each :py:class:`~.rt_dataset.Dataset` + row that has at least one value that's positive or negative infinity. + :py:meth:`.rt_dataset.Dataset.mask_and_isinf` : + Return a boolean array that's `True` for each :py:class:`~.rt_dataset.Dataset` + row that contains all infinite values. 
Examples -------- >>> a = rt.FastArray([rt.inf, -rt.inf, rt.nan, 0]) >>> rt.isfinite(a) - FastArray([False, False, False, True]) + FastArray([False, False, False, True]) >>> rt.isfinite(1) True @@ -4260,40 +4481,46 @@ def isfinite(*args, **kwargs) -> FastArray | bool: # ------------------------------------------------------- def isnotfinite(*args, **kwargs) -> FastArray | bool: """ - Return True for each non-finite element, False otherwise. + Return `True` for each non-finite element, `False` otherwise. A value is considered to be finite if it's not positive or negative infinity - or a NaN (Not a Number). + or a ``NaN`` (Not a Number). Parameters ---------- *args : - See :py:data:`numpy.isfinite`. + See :py:obj:`numpy.isfinite`. **kwargs : - See :py:data:`numpy.isfinite`. + See :py:obj:`numpy.isfinite`. Returns ------- - `FastArray` or bool - For array input, a `FastArray` of booleans is returned that's True for each - non-finite element, False otherwise. For scalar input, a boolean is returned. + :py:class:`~.rt_fastarray.FastArray` or bool + For array input, a :py:class:`~.rt_fastarray.FastArray` of booleans is returned + that's `True` for each non-finite element, `False` otherwise. For scalar input, + a boolean is returned. See Also -------- - riptable.isfinite, riptable.isinf, riptable.isnotinf, FastArray.isfinite, - FastArray.isnotfinite, FastArray.isinf, FastArray.isnotinf - Dataset.mask_or_isfinite : - Return a boolean array that's True for each `Dataset` row that has at least - one finite value. - Dataset.mask_and_isfinite : - Return a boolean array that's True for each `Dataset` row that contains all - finite values. - Dataset.mask_or_isinf : - Return a boolean array that's True for each `Dataset` row that has at least - one value that's positive or negative infinity. - Dataset.mask_and_isinf : - Return a boolean array that's True for each `Dataset` row that contains all - infinite values. 
+ :py:func:`.rt_numpy.isfinite` + :py:func:`.rt_numpy.isinf` + :py:func:`.rt_numpy.isnotinf` + :py:meth:`.rt_fastarray.FastArray.isfinite` + :py:meth:`.rt_fastarray.FastArray.isnotfinite` + :py:meth:`.rt_fastarray.FastArray.isinf` + :py:meth:`.rt_fastarray.FastArray.isnotinf` + :py:meth:`.rt_dataset.Dataset.mask_or_isfinite` : + Return a boolean array that's `True` for each :py:class:`~.rt_dataset.Dataset` + row that has at least one finite value. + :py:meth:`.rt_dataset.Dataset.mask_and_isfinite` : + Return a boolean array that's `True` for each :py:class:`~.rt_dataset.Dataset` + row that contains all finite values. + :py:meth:`.rt_dataset.Dataset.mask_or_isinf` : + Return a boolean array that's `True` for each :py:class:`~.rt_dataset.Dataset` + row that has at least one value that's positive or negative infinity. + :py:meth:`.rt_dataset.Dataset.mask_and_isinf` : + Return a boolean array that's `True` for each :py:class:`~.rt_dataset.Dataset` + row that contains all infinite values. Examples -------- @@ -4313,38 +4540,44 @@ def isnotfinite(*args, **kwargs) -> FastArray | bool: # ------------------------------------------------------- def isinf(*args, **kwargs) -> FastArray | bool: """ - Return True for each element that's positive or negative infinity, False otherwise. + Return `True` for each element that's positive or negative infinity, `False` + otherwise. Parameters ---------- *args : - See :py:data:`numpy.isinf`. + See :py:obj:`numpy.isinf`. **kwargs : - See :py:data:`numpy.isinf`. + See :py:obj:`numpy.isinf`. Returns ------- - `FastArray` or bool - For array input, a `FastArray` of booleans is returned that's True for each - element that's positive or negative infinity, False otherwise. For scalar - input, a boolean is returned. + :py:class:`~.rt_fastarray.FastArray` or bool + For array input, a :py:class:`~.rt_fastarray.FastArray` of booleans is returned + that's `True` for each element that's positive or negative infinity, `False` + otherwise. 
For scalar input, a boolean is returned. See Also -------- - riptable.isnotinf, riptable.isfinite, riptable.isnotfinite, FastArray.isinf, - FastArray.isnotinf, FastArray.isfinite, FastArray.isnotfinite - Dataset.mask_or_isfinite : - Return a boolean array that's True for each `Dataset` row that has at least - one finite value. - Dataset.mask_and_isfinite : - Return a boolean array that's True for each `Dataset` row that contains all - finite values. - Dataset.mask_or_isinf : - Return a boolean array that's True for each `Dataset` row that has at least - one value that's positive or negative infinity. - Dataset.mask_and_isinf : - Return a boolean array that's True for each `Dataset` row that contains all - infinite values. + :py:func:`.rt_numpy.isnotinf` + :py:func:`.rt_numpy.isfinite` + :py:func:`.rt_numpy.isnotfinite` + :py:meth:`.rt_fastarray.FastArray.isinf` + :py:meth:`.rt_fastarray.FastArray.isnotinf` + :py:meth:`.rt_fastarray.FastArray.isfinite` + :py:meth:`.rt_fastarray.FastArray.isnotfinite` + :py:meth:`.rt_dataset.Dataset.mask_or_isfinite` : + Return a boolean array that's `True` for each :py:class:`~.rt_dataset.Dataset` + row that has at least one finite value. + :py:meth:`.rt_dataset.Dataset.mask_and_isfinite` : + Return a boolean array that's `True` for each :py:class:`~.rt_dataset.Dataset` + row that contains all finite values. + :py:meth:`.rt_dataset.Dataset.mask_or_isinf` : + Return a boolean array that's `True` for each :py:class:`~.rt_dataset.Dataset` + row that has at least one value that's positive or negative infinity. + :py:meth:`.rt_dataset.Dataset.mask_and_isinf` : + Return a boolean array that's `True` for each :py:class:`~.rt_dataset.Dataset` + row that contains all infinite values. 
Examples -------- @@ -4364,39 +4597,44 @@ def isinf(*args, **kwargs) -> FastArray | bool: # ------------------------------------------------------- def isnotinf(*args, **kwargs) -> FastArray | bool: """ - Return True for each element that's not positive or negative infinity, - False otherwise. + Return `True` for each element that's not positive or negative infinity, + `False` otherwise. Parameters ---------- *args : - See :py:data:`numpy.isinf`. + See :py:obj:`numpy.isinf`. **kwargs : - See :py:data:`numpy.isinf`. + See :py:obj:`numpy.isinf`. Returns ------- - `FastArray` or bool - For array input, a `FastArray` of booleans is returned that's True for each - element that's not positive or negative infinity, False otherwise. For scalar - input, a boolean is returned. + :py:class:`~.rt_fastarray.FastArray` or bool + For array input, a :py:class:`~.rt_fastarray.FastArray` of booleans is returned + that's `True` for each element that's not positive or negative infinity, `False` + otherwise. For scalar input, a boolean is returned. See Also -------- - riptable.isinf, FastArray.isnotinf, FastArray.isinf, riptable.isfinite, - riptable.isnotfinite, FastArray.isfinite, FastArray.isnotfinite - Dataset.mask_or_isfinite : - Return a boolean array that's True for each `Dataset` row that has at least - one finite value. - Dataset.mask_and_isfinite : - Return a boolean array that's True for each `Dataset` row that contains all - finite values. - Dataset.mask_or_isinf : - Return a boolean array that's True for each `Dataset` row that has at least - one value that's positive or negative infinity. - Dataset.mask_and_isinf : - Return a boolean array that's True for each `Dataset` row that contains all - infinite values. 
+ :py:func:`.rt_numpy.isinf` + :py:func:`.rt_numpy.isfinite` + :py:func:`.rt_numpy.isnotfinite` + :py:meth:`.rt_fastarray.FastArray.isnotinf` + :py:meth:`.rt_fastarray.FastArray.isinf` + :py:meth:`.rt_fastarray.FastArray.isfinite` + :py:meth:`.rt_fastarray.FastArray.isnotfinite` + :py:meth:`.rt_dataset.Dataset.mask_or_isfinite` : + Return a boolean array that's `True` for each :py:class:`~.rt_dataset.Dataset` + row that has at least one finite value. + :py:meth:`.rt_dataset.Dataset.mask_and_isfinite` : + Return a boolean array that's `True` for each :py:class:`~.rt_dataset.Dataset` + row that contains all finite values. + :py:meth:`.rt_dataset.Dataset.mask_or_isinf` : + Return a boolean array that's `True` for each :py:class:`~.rt_dataset.Dataset` + row that has at least one value that's positive or negative infinity. + :py:meth:`.rt_dataset.Dataset.mask_and_isinf` : + Return a boolean array that's `True` for each :py:class:`~.rt_dataset.Dataset` + row that contains all infinite values. Examples -------- @@ -4530,12 +4768,12 @@ def repeat(a, repeats, axis=None): Parameters ---------- a : array or scalar - The input array or scalar. Each element will be repeated consecutively - `repeats` times. If no `axis` is specified, multi-dimensional arrays are + The input array or scalar. Each element is repeated consecutively + ``repeats`` times. If no ``axis`` is specified, multi-dimensional arrays are flattened and a flattened array is returned. repeats : int or array of int - The number of consecutive repetitions for each element of `a`. If an - `axis` is specified, the elements are repeated along that axis. + The number of consecutive repetitions for each element of ``a``. If an + ``axis`` is specified, the elements are repeated along that axis. axis : int, optional The axis along which to repeat the values. If no axis is specified, the input array is flattened and a flattened array is returned. 
For examples @@ -4545,13 +4783,13 @@ def repeat(a, repeats, axis=None): Returns ------- - `FastArray` - A new `FastArray` that has the same shape as `a`, except along the given - axis. + :py:class:`~.rt_fastarray.FastArray` + A new :py:class:`~.rt_fastarray.FastArray` that has the same shape as ``a``, + except along the given axis. See Also -------- - riptable.tile : Construct an array by repeating a specified array. + :py:func:`.rt_numpy.tile` : Construct an array by repeating a specified array. Examples -------- @@ -4566,7 +4804,7 @@ def repeat(a, repeats, axis=None): >>> rt.repeat(x, 2) FastArray([1, 1, 2, 2, 3, 3, 4, 4]) - Use an array for `repeats`: + Use an array for ``repeats``: >>> rt.repeat(x, [1, 2, 3, 4]) FastArray([1, 2, 2, 3, 3, 3, 4, 4, 4, 4]) @@ -4581,27 +4819,27 @@ def repeat(a, repeats, axis=None): # ------------------------------------------------------------ def tile(arr, reps): """ - Construct an array by repeating a specified array a specified number of - times. + Construct an array by repeating an input array a specified number of times. Parameters ---------- - a : array or scalar + arr : array or scalar The input array or scalar. - reps: int or array of int - The number of repetitions of `a` along each axis. For examples of `tile` - used with multi-dimensional arrays, see :py:func:`numpy.tile`. Note that - although multi-dimensional arrays are technically supported by Riptable, - you may get unexpected results when working with them. + reps : int or array of int + The number of repetitions of ``arr`` along each axis. For examples of + :py:func:`~.rt_numpy.tile` used with multi-dimensional arrays, see + :py:func:`numpy.tile`. Note that although multi-dimensional arrays are + technically supported by Riptable, you may get unexpected results when working + with them. Returns ------- - `FastArray` - A new `FastArray` of the repeated input arrays. 
+ :py:class:`~.rt_fastarray.FastArray` + A new :py:class:`~.rt_fastarray.FastArray` of the repeated input arrays. See Also -------- - riptable.repeat : + :py:func:`.rt_numpy.repeat` : Construct an array by repeating each element of a specified array. Examples diff --git a/riptable/rt_pdataset.py b/riptable/rt_pdataset.py index 4854cf9..abfcc33 100644 --- a/riptable/rt_pdataset.py +++ b/riptable/rt_pdataset.py @@ -552,7 +552,8 @@ def prow_labeler(self, rownumbers, style): # column header header = "partition + #" - rownumbers = plabels + " " + rownumbers + if len(plabels) > 0: + rownumbers = plabels + " " + rownumbers # set the style width to override the string trim style.width = rownumbers.itemsize diff --git a/riptable/rt_sds.py b/riptable/rt_sds.py index 9db67ca..99c1e9b 100644 --- a/riptable/rt_sds.py +++ b/riptable/rt_sds.py @@ -533,9 +533,12 @@ def _sds_path_multi(path, share=None, overwrite=True): if SDSVerbose: VerbosePrint(f"calling makedirs") if SDSMakeDirs: - os.makedirs(path) + os.makedirs(path, exist_ok=True) # ignore multi-proc create race errors else: - os.mkdir(path) + try: + os.mkdir(path) + except FileExistsError: # ignore multi-proc create race errors + pass # raise ValueError(f'Directory {path!r} does not exist. SDSMakeDirs global variable must be set to auto create sub directories.') return True @@ -613,9 +616,12 @@ def _sds_path_single(path, share=None, overwrite=True, name=None, append=None): if len(path[:-dir_end]) > 0: newpath = path[:-dir_end] if SDSMakeDirs: - os.makedirs(newpath) + os.makedirs(newpath, exist_ok=True) # ignore multi-proc create race errors else: - os.mkdir(newpath) + try: + os.mkdir(newpath) + except FileExistsError: # ignore multi-proc create race errors + pass # raise ValueError(f'Directory {newpath!r} does not exist. 
SDSMakeDirs global variable must be set to auto create sub directories.') return path, name, True diff --git a/riptable/rt_stats.py b/riptable/rt_stats.py index 30ec48f..4e58b66 100644 --- a/riptable/rt_stats.py +++ b/riptable/rt_stats.py @@ -1,6 +1,5 @@ __all__ = [ "class_error", - "groupScatter", "linear_spline", "lm", "mae", @@ -186,7 +185,7 @@ def linear_spline(X0, Y0, knots, display=True): return knots, coeff -# TODO: Make formatting aware of environment, e.g., Spyder, jupyter, etc. in groupScatter and plotPrediction +# TODO: Make formatting aware of environment, e.g., Spyder, jupyter, etc. in plotPrediction # NOTE: Can we use regPlot from seaborn # won't display in jupyter lab # better auto-detect bounds @@ -257,10 +256,3 @@ def polyFit(x, y, d=1, filter=None): Ay = np.matmul(A.transpose(), y) c = np.linalg.solve(AtA, Ay) return c - - -def groupScatter(*arg, **kwarg): - """ - This function has been moved to playa.stats. - """ - raise NotImplementedError("This function has been moved to playa.plot") diff --git a/riptable/rt_struct.py b/riptable/rt_struct.py index 7bd1e2f..5a24482 100644 --- a/riptable/rt_struct.py +++ b/riptable/rt_struct.py @@ -5137,16 +5137,41 @@ def __eq__(self, other): return (total_physical_size, total_logical_size) - def key_search(self, regex, case_sensitive=False, recursive=True, path=""): + def key_search(self, regex: str, case_sensitive: bool = False, recursive: bool = True, path: str = "") -> list[str]: + """ + Search through the keys of the :py:class:`~.rt_struct.Struct` using + :py:func:`re.search`, and return the resulting list of keys. + + Parameters + ---------- + regex : str + Regular expression (:py:obj:`re`) used to search. + case_sensitive : bool, optional + If `True`, ignore letter case of keys. + recursive : bool, optional + If `True`, applies :py:meth:`~.rt_struct.Struct.key_search` to elements of + the :py:class:`~.rt_struct.Struct` that are also + :py:class:`~.rt_struct.Struct` objects. 
+ path : str, optional + String to prepend to all returned keys. This is useful when ``recursive`` + equals `True`. + + Returns + ------- + list of str + List of keys that match ``regex``. + """ if case_sensitive: pattern = re.compile(regex) else: pattern = re.compile(regex, re.IGNORECASE) output = [path + s for s in self.keys() if pattern.search(s)] - if recursive: + # don't bother recusing into Datasets, as they aren't recursive containers. + if recursive and not isinstance(self, TypeRegister.Dataset): for s in self.keys(): - if isinstance(self[s], Struct): - output = output + self[s].key_search( + ds = self[s] + if isinstance(ds, Struct): + output = output + ds.key_search( regex, case_sensitive=case_sensitive, recursive=recursive, path=path + s + "." ) return output diff --git a/riptable/rt_timezone.py b/riptable/rt_timezone.py index a836ce5..50e920a 100644 --- a/riptable/rt_timezone.py +++ b/riptable/rt_timezone.py @@ -1002,9 +1002,12 @@ class TimeZone: """ Stores daylight savings cutoff information so UTC times can be translated to zone-specific times. + Every `DateTimeNano` object holds a `TimeZone` object. All timezone-related conversions / fixups will be handled by the `TimeZone` class. + |To see supported timezones, use ``rt.TimeZone.valid_timezones``.| + Parameters ---------- from_tz : str, defaults to None @@ -1070,8 +1073,9 @@ def normalize_tz_to_tzdb_name(tz_name: str) -> str: if tz_long_name is not None: return tz_long_name + valid_tzs = '"' + '", "'.join(TimeZone.valid_timezones) + '"' raise ValueError( - f"The timezone name '{tz_name}' is not recognized as either a tz database timezone name or an alias timezone name." + f"The timezone name '{tz_name}' is not recognized as either a tz database timezone name or an alias timezone name. 
Valid timezones are: {valid_tzs}" ) # ------------------------------------------------------------ diff --git a/riptable/rt_utils.py b/riptable/rt_utils.py index 8bd7e44..8c4d32d 100644 --- a/riptable/rt_utils.py +++ b/riptable/rt_utils.py @@ -46,6 +46,8 @@ def load_h5( filepath: Union[str, os.PathLike], name: str = "/", columns: Union[Sequence[str], "re.Pattern", Callable[..., Sequence[str]]] = "", + condition: Union[None, slice, Sequence[int], Sequence[bool]] = None, + nthreads: Optional[int] = 16, format=None, fixblocks: bool = False, drop_short: bool = False, @@ -69,14 +71,25 @@ def load_h5( If a function is passed, it will be called with column names, dtypes and shapes, and should return a subset of column names. Passing an empty string (the default) loads all columns. + condition : slice or sequence of int or sequence of boolean or None + The condition for choosing the rows to load. This is essentially the "row filter" you would use on a numpy + array, so you can use a slice, a list/array of indices, or a boolean mask list/array aligned with the data. + This parameter support more advanced values (tuple and callable) but they are not implemented in the + multi-threaded hdf5_loader so using them will automatically use the reference HDF5 library and `nthreads` will + have no effect. You can see the usage in `hdf5.load` docstring. + Default: None, no filtering. + nthreads : int or None + The number of threads to use. Set to None if want to use the single-thread reference HDF5 library. Default: 16. format : hdf5.Format - TODO, defaults to hdf5.Format.NDARRAY + The `format` parameter for `hdf5.load`. You should not need to set it. Default: hdf5.Format.NDARRAY. fixblocks : bool True will transpose the rows when the H5 file are as ???, defaults to False. drop_short : bool Set to True to drop short rows and never return a Struct, defaults to False. verbose TODO + **kwargs : dict + Extra arguments passed to `hdf5.load`, see `hdf5.load` docstring for more details. 
Returns ------- @@ -101,7 +114,16 @@ def load_h5( print(f"starting h5 load {filepath}") # TEMP: Until hdf5.load() implements support for path-like objects, force conversion to str. filepath = os.fspath(filepath) - ws = hdf5.load(filepath, name=name, columns=columns, format=format, **kwargs) + ws = hdf5.load( + filepath, + name=name, + columns=columns, + condition=condition, + format=format, + nthreads=nthreads, + to_columnar=True, + **kwargs, + ) if verbose > 0: print(f"finished h5 load {filepath}") diff --git a/riptable/tests/test_base_function.py b/riptable/tests/test_base_function.py index 1f36d15..b2b488f 100644 --- a/riptable/tests/test_base_function.py +++ b/riptable/tests/test_base_function.py @@ -245,3 +245,53 @@ def test_where_dtype_demoting(self): err_msg=f"array elements greater than the minimum {min}", ) assert dtype == rt_gt_min[0].dtype, "expected dtype of the original type" + + +class TestArange: + @pytest.mark.parametrize( + "args, kwargs", + [ + pytest.param((3,), {}, id="stop"), + pytest.param((10, 13), {}, id="start,stop"), + pytest.param((21, 24, 2), {}, id="start,stop,step"), + pytest.param((33,), dict(dtype=float), id="stop,dtypeF"), + pytest.param((43,), dict(dtype=rt.Date), id="stop,dtypeD"), + pytest.param((51, 3), dict(like=np.array([])), id="start,stop,like=np"), + pytest.param( + (1, 3), + dict(like=rt.FA([])), + id="start,stop,like=FA", + marks=pytest.mark.xfail(strict=True, reason="#310"), + ), + pytest.param((1, 3), dict(like=np.array([]), dtype=np.uint64), id="start,stop,like=np,dtypeU"), + pytest.param((9,), dict(stop=11), id="start,kw_stop"), + pytest.param((8,), dict(stop=21, step=4), id="start,kw_stop,kw_step"), + pytest.param((), dict(start=7, stop=19, step=3), id="kw_start,kw_stop,kw_step"), + pytest.param((), dict(stop=123), id="kw_stop"), + ], + ) + def test_arange(self, args, kwargs): + na = np.arange(*args, **kwargs) + fa = rt.arange(*args, **kwargs) + assert_array_equal(na, fa) + + @pytest.mark.parametrize( + "args, 
kwargs, ex", + [ + pytest.param((), {}, TypeError, id="empty"), + pytest.param( + (), + dict(start=123), + TypeError, + id="start_only", + marks=pytest.mark.xfail(reason="Cannot distinguish start= from stop", strict=True), + ), + pytest.param((), dict(step=123), TypeError, id="step_only"), + pytest.param((88,), dict(start=87), TypeError, id="kw_start,stop"), + ], + ) + def test_bad_arange(self, args, kwargs, ex): + with pytest.raises(ex): + np.arange(*args, **kwargs) + with pytest.raises(ex): + rt.arange(*args, **kwargs) diff --git a/riptable/tests/test_dataset.py b/riptable/tests/test_dataset.py index fadf590..646915c 100644 --- a/riptable/tests/test_dataset.py +++ b/riptable/tests/test_dataset.py @@ -550,6 +550,20 @@ def test_assign_subclasses(self): self.assertEqual(type(ds.b), subcls) self.assertTrue((ds.a == ds.b).all()) + def test_assign_time_scalars(self): + ds = rt.Dataset({"a": [1, 2, 3]}) + for cls in [ + rt.DateTimeNano, + rt.TimeSpan, + rt.DateSpan, + rt.Date, + ]: + arr = cls(ds.a) + for use_scalar in (True, False): + ds.b = arr[0] if use_scalar else [arr[0]] + self.assertEqual(type(ds.b), type(arr)) + self.assertTrue((ds.b == arr[0].repeat(len(ds))).all()) + def test_assign_cat_null(self): ds = rt.Dataset({"col_0": [4, 5]}) c = rt.Cat([1, 2, 1, 2, 1, 2]) diff --git a/riptable/tests/test_saveload.py b/riptable/tests/test_saveload.py index 08fadaa..6671633 100644 --- a/riptable/tests/test_saveload.py +++ b/riptable/tests/test_saveload.py @@ -1,3 +1,4 @@ +import contextlib import os import pathlib import shutil @@ -16,6 +17,7 @@ from riptable.testing.array_assert import assert_array_or_cat_equal from riptable.testing.randgen import create_test_dataset from riptable.tests.test_utils import get_all_categorical_data +from riptable.tests.utils import get_rc_version, parse_version from riptable.Utils.rt_metadata import MetaData from riptable.Utils.rt_testdata import load_test_data from riptable.Utils.rt_testing import ( @@ -1327,6 +1329,29 @@ def 
test_load_filter(self): # os.remove(p) +@pytest.mark.parametrize("makedirs", [True, False], ids=["makesubdirs", "makedir"]) +@pytest.mark.parametrize("single", [True, False], ids=["single", "multi"]) +@pytest.mark.parametrize("subdir", ["", "old", "new", "new1/new2"]) +def test_save_subdirs(makedirs, single, subdir): + orig_makedirs = rt_sds.SDSMakeDirs + rt_sds.SDSMakeDirs = makedirs + try: + with tempfile.TemporaryDirectory() as tmpdirname: + obj = rt.Dataset() if single else rt.Struct() + + os.mkdir(os.path.join(tmpdirname, "old")) + path = os.path.join(tmpdirname, subdir, "test.sds") + + shouldnt_mkdirs = (not makedirs) and "/" in subdir + + expected_ex = FileNotFoundError if shouldnt_mkdirs else None + + with pytest.raises(expected_ex) if expected_ex else contextlib.nullcontext(): + rt_sds.save_sds(path, obj) + finally: + rt_sds.SDSMakeDirs = orig_makedirs + + # TODO fold test_sds_stack_with_categorical into the more general test_sds_stack # We will still want to test across various container types, but rt_test_data module should be responsible for that detail @pytest.mark.parametrize("container_type", [Dataset, Struct]) @@ -1433,6 +1458,107 @@ def test_sds_stack(data, stack, stack_count, tmpdir): pytest.fail(f"{fn}: assertions not implemented for data type {type(exp)}\n" + err_msg) +@pytest.mark.parametrize( + "datas, expected", + [ + pytest.param( + [ + rt.Dataset(dict(A=rt.FA([10, 11], dtype="i"))), + rt.Dataset(dict(A=rt.FA([20, 21], dtype="i"))), + ], + rt.PDataset, + id="1D:reg-reg", + ), + pytest.param( + [ + rt.Dataset(dict(A=rt.FA([], dtype="i"))), + rt.Dataset(dict(A=rt.FA([], dtype="i"))), + ], + rt.PDataset, + id="1D:empty-empty", + ), + pytest.param( + [ + rt.Dataset(dict(A=rt.FA([], dtype="i"))), + rt.Dataset(dict(A=rt.FA([20, 21], dtype="i"))), + ], + rt.PDataset, + id="1D:empty-reg", + marks=pytest.mark.xfail( + get_rc_version() < parse_version("1.16.2a"), reason="RIPTABLE-213 - stack(empty,...)" + ), + ), + pytest.param( + [ + 
rt.Dataset(dict(A=rt.FA([10, 11], dtype="i"))), + rt.Dataset(dict(A=rt.FA([], dtype="i"))), + ], + rt.PDataset, + id="1D:reg-empty", + ), + pytest.param( + [ + rt.Dataset(dict(A=rt.FA([[10, 11, 12], [110, 111, 121]], dtype="i"))), + rt.Dataset(dict(A=rt.FA([[20, 21, 22], [120, 121, 122]], dtype="i"))), + ], + rt.PDataset, + id="2D:reg-reg", + ), + pytest.param( + [ + rt.Dataset(dict(A=rt.empty(shape=(0, 3), dtype="i"))), + rt.Dataset(dict(A=rt.empty(shape=(0, 3), dtype="i"))), + ], + rt.PDataset, + id="2D:empty-empty", + ), + pytest.param( + [ + rt.Dataset(dict(A=rt.empty(shape=(0, 3), dtype="i"))), + rt.Dataset(dict(A=rt.FA([[20, 21, 22], [120, 121, 122]], dtype="i"))), + ], + rt.PDataset, + id="2D:empty-reg", + marks=pytest.mark.xfail( + get_rc_version() < parse_version("1.16.2a"), reason="RIPTABLE-213 - stack(empty,...)" + ), + ), + pytest.param( + [ + rt.Dataset(dict(A=rt.FA([[20, 21, 22], [120, 121, 122]], dtype="i"))), + rt.Dataset(dict(A=rt.empty(shape=(0, 3), dtype="i"))), + ], + rt.PDataset, + id="2D:reg-empty", + marks=pytest.mark.xfail( + get_rc_version() < parse_version("1.16.2a"), reason="RIPTABLE-213 - stack(empty,...)" + ), + ), + ], +) +def test_sds_stack_empty(datas, expected, tmpdir): + if expected is rt.PDataset: + expected = rt.PDataset(datas) + + def zip_strict(a, b): + return zip(a, b, strict=True) if sys.version_info[:2] >= (3, 10) else zip(a, b) + + fnames = [os.path.join(tmpdir, f"sds_{i}.sds") for i in range(len(datas))] + + for data, fname in zip_strict(datas, fnames): + rt.save_sds(fname, data) + + actual = rt.load_sds(fnames, stack=True) + + assert type(actual) == type(expected) + assert actual.pcount == expected.pcount + + for i, ((ne, ve), (na, va)) in enumerate(zip_strict(expected.piter, actual.piter)): + assert len(ve) == len(va) + for re, ra in zip_strict(ve.values(), va.values()): + assert np.all(re == ra) + + @pytest.mark.parametrize( "data", [ diff --git a/riptable/tests/test_struct.py b/riptable/tests/test_struct.py index 
c87b131..9d097bd 100644 --- a/riptable/tests/test_struct.py +++ b/riptable/tests/test_struct.py @@ -982,8 +982,32 @@ def test_key_search(self): mystruct.substruct = substruct st.Alex = mystruct self.assertEqual( - st.key_search("a.*e"), ["Apple", "Alex", "opp.ale", "Alex.allen", "Alex.substruct.myds2.Valence"] + st.key_search("a.*e", recursive=False), + ["Apple", "Alex"], ) + self.assertEqual( + st.key_search("a.*e", recursive=True), + ["Apple", "Alex", "opp.ale", "Alex.allen", "Alex.substruct.myds2.Valence"], + ) + + def test_key_search_dataset(self): + # Check that key_search does not get any keys from the dataset when recursing + class MockDataset(rt.Dataset): + # This class counts the number of __getitem__ calls. + _n_getitem_calls = 0 + + def __getitem__(self, key): + self._n_getitem_calls += 1 + return super().__getitem__(key) + + ds = MockDataset({"a": [1]}) + self.assertEqual(ds.key_search("a", recursive=True), ["a"]) + assert ds._n_getitem_calls == 0 + + ds = MockDataset({"a": [9.8]}) + st = rt.Struct({"a": [9], "b": ds}) + self.assertEqual(st.key_search("a", recursive=True), ["a", "b.a"]) + assert ds._n_getitem_calls == 0 @pytest.mark.parametrize(