diff --git a/docs/api/dt/cumsum.rst b/docs/api/dt/cumsum.rst new file mode 100644 index 0000000000..8d0c8760e8 --- /dev/null +++ b/docs/api/dt/cumsum.rst @@ -0,0 +1,87 @@ + +.. xfunction:: datatable.cumsum + :src: src/core/expr/fexpr_cumsum.cc pyfn_cumsum + :tests: tests/dt/test-cumsum.py + :cvar: doc_dt_cumsum + :signature: cumsum(cols) + + .. x-version-added:: 1.1.0 + + For each column from `cols` calculate cumulative sum. In the presence of :func:`by()`, + the cumulative sum is computed per group. + + Parameters + ---------- + cols: FExpr + Input data for cumulative sum calculation. + + return: FExpr + f-expression that converts input columns into the columns filled + with the respective cumulative sums. + + except: TypeError + The exception is raised when one of the columns from `cols` + has a non-numeric type. + + + Examples + -------- + + Create a sample datatable frame:: + + >>> from datatable import dt, f + >>> DT = dt.Frame({"A": [2, None, 5, -1, 0], + ... "B": [None, None, None, None, None], + ... "C": [5.4, 3, 2.2, 4.323, 3], + ... "D": ['a', 'a', 'b', 'b', 'b']}) + | A B C D + | int32 void float64 str32 + -- + ----- ---- ------- ----- + 0 | 2 NA 5.4 a + 1 | NA NA 3 a + 2 | 5 NA 2.2 b + 3 | -1 NA 4.323 b + 4 | 0 NA 3 b + [5 rows x 4 columns] + + + Calculate cumulative sum in a single column:: + + >>> DT[:, dt.cumsum(f.A)] + | A + | int64 + -- + ----- + 0 | 2 + 1 | 2 + 2 | 7 + 3 | 6 + 4 | 6 + [5 rows x 1 column] + + + Calculate cumulative sums in multiple columns:: + + >>> DT[:, dt.cumsum(f[:-1])] + | A B C + | int64 int64 float64 + -- + ----- ----- ------- + 0 | 2 0 5.4 + 1 | 2 0 8.4 + 2 | 7 0 10.6 + 3 | 6 0 14.923 + 4 | 6 0 17.923 + [5 rows x 3 columns] + + + Calculate cumulative sums per group in the presence of :func:`by()`:: + + >>> DT[:, dt.cumsum(f[:]), by('D')] + | D A B C + | str32 int64 int64 float64 + -- + ----- ----- ----- ------- + 0 | a 2 0 5.4 + 1 | a 2 0 8.4 + 2 | b 5 0 2.2 + 3 | b 4 0 6.523 + 4 | b 4 0 9.523 + [5 rows x 4 columns] diff --git a/docs/api/fexpr.rst b/docs/api/fexpr.rst index d4ab42a575..14bd7674ea 100644 --- a/docs/api/fexpr.rst +++ b/docs/api/fexpr.rst @@ -163,6 +163,9 @@ * - :meth:`.countna()` - Same as :func:`dt.countna()`. + * - :meth:`.cumsum()` + - Same as :func:`dt.cumsum()`. + * - :meth:`.first()` - Same as :func:`dt.first()`. @@ -286,6 +289,7 @@ .as_type() .count() .countna() + .cumsum() .extend() .first() .last() diff --git a/docs/api/fexpr/cumsum.rst b/docs/api/fexpr/cumsum.rst new file mode 100644 index 0000000000..c18e080f4c --- /dev/null +++ b/docs/api/fexpr/cumsum.rst @@ -0,0 +1,7 @@ + +.. xmethod:: datatable.FExpr.cumsum + :src: src/core/expr/fexpr.cc PyFExpr::cumsum + :cvar: doc_FExpr_cumsum + :signature: cumsum() + + Equivalent to :func:`dt.cumsum(self)`. diff --git a/docs/api/index-api.rst b/docs/api/index-api.rst index 5256e448c5..1d471d2d8a 100644 --- a/docs/api/index-api.rst +++ b/docs/api/index-api.rst @@ -163,6 +163,8 @@ Functions - Count non-missing values per column * - :func:`countna()` - Count the number of NA values per column + * - :func:`cumsum()` + - Calculate the cumulative sum of values per column * - :func:`cov()` - Calculate covariance between two columns * - :func:`max()` @@ -232,6 +234,7 @@ Other count()
countna()
cov()
+ cumsum()
cut()
dt
f
diff --git a/docs/releases/v1.1.0.rst b/docs/releases/v1.1.0.rst index bcfb7c6d3c..c76e26b410 100644 --- a/docs/releases/v1.1.0.rst +++ b/docs/releases/v1.1.0.rst @@ -71,6 +71,9 @@ -[new] Added reducer function :func:`dt.prod()` and the corresponding :meth:`.prod()` method to calculate product of values in columns. [#3140] + -[new] Added function :func:`dt.cumsum()`, as well as :meth:`.cumsum()` method, + to calculate the cumulative sum of values per column. [#3279] + -[enh] Added reducer functions :func:`dt.countna()` and :func:`dt.nunique()`. [#2999] -[new] Class :class:`dt.FExpr` now has method :meth:`.nunique()`, diff --git a/src/core/column/cumsum.h b/src/core/column/cumsum.h new file mode 100644 index 0000000000..4c9c2474de --- /dev/null +++ b/src/core/column/cumsum.h @@ -0,0 +1,92 @@ +//------------------------------------------------------------------------------ +// Copyright 2022 H2O.ai +// +// Permission is hereby granted, free of charge, to any person obtaining a +// copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. +//------------------------------------------------------------------------------ +#ifndef dt_COLUMN_CUMSUM_h +#define dt_COLUMN_CUMSUM_h +#include "column/virtual.h" +#include "parallel/api.h" +#include "stype.h" + +namespace dt { + + +template +class Cumsum_ColumnImpl : public Virtual_ColumnImpl { + private: + Column col_; + Groupby gby_; + + public: + Cumsum_ColumnImpl(Column&& col, const Groupby& gby) + : Virtual_ColumnImpl(col.nrows(), col.stype()), + col_(std::move(col)), + gby_(gby) + { + xassert(col_.can_be_read_as()); + } + + + void materialize(Column& col_out, bool) override { + Column col = Column::new_data_column(col_.nrows(), col_.stype()); + auto data = static_cast(col.get_data_editable()); + + auto offsets = gby_.offsets_r(); + dt::parallel_for_dynamic( + gby_.size(), + [&](size_t gi) { + size_t i1 = size_t(offsets[gi]); + size_t i2 = size_t(offsets[gi + 1]); + + T val; + bool is_valid = col_.get_element(i1, &val); + data[i1] = is_valid? val : 0; + + for (size_t i = i1 + 1; i < i2; ++i) { + is_valid = col_.get_element(i, &val); + data[i] = data[i - 1] + (is_valid? val : 0); + } + + }); + + col_out = std::move(col); + } + + + ColumnImpl* clone() const override { + return new Cumsum_ColumnImpl(Column(col_), gby_); + } + + size_t n_children() const noexcept override { + return 1; + } + + const Column& child(size_t i) const override { + xassert(i == 0); (void)i; + return col_; + } + +}; + + +} // namespace dt + + +#endif diff --git a/src/core/documentation.h b/src/core/documentation.h index ad26b124af..3f4560e3ae 100644 --- a/src/core/documentation.h +++ b/src/core/documentation.h @@ -23,7 +23,6 @@ #define dt_DOCUMENTATION_h namespace dt { - extern const char* doc_dt_as_type; extern const char* doc_dt_by; extern const char* doc_dt_cbind; @@ -31,6 +30,7 @@ extern const char* doc_dt_corr; extern const char* doc_dt_count; extern const char* doc_dt_countna; extern const char* doc_dt_cov; +extern const char* doc_dt_cumsum; extern const char* doc_dt_cut; extern const char* doc_dt_first; extern const char* doc_dt_fread; @@ -279,6 +279,7 @@ extern const char* doc_FExpr; extern const char* doc_FExpr_as_type; extern const char* doc_FExpr_count; extern const char* doc_FExpr_countna; +extern const char* doc_FExpr_cumsum; extern const char* doc_FExpr_extend; extern const char* doc_FExpr_first; extern const char* doc_FExpr_last; diff --git a/src/core/expr/fexpr.cc b/src/core/expr/fexpr.cc index 31c62f6a1a..051f753001 100644 --- a/src/core/expr/fexpr.cc +++ b/src/core/expr/fexpr.cc @@ -578,6 +578,16 @@ DECLARE_METHOD(&PyFExpr::countna) ->name("countna") ->docs(dt::doc_FExpr_countna); + +oobj PyFExpr::cumsum(const XArgs&) { + auto cumsumFn = oobj::import("datatable", "cumsum"); + return cumsumFn.call({this}); +} + +DECLARE_METHOD(&PyFExpr::cumsum) + ->name("cumsum") + ->docs(dt::doc_FExpr_cumsum); + //------------------------------------------------------------------------------ // Class decoration //------------------------------------------------------------------------------ diff --git a/src/core/expr/fexpr.h b/src/core/expr/fexpr.h index 93d6c25718..8cc08916f6 100644 --- a/src/core/expr/fexpr.h +++ b/src/core/expr/fexpr.h @@ -182,6 +182,7 @@ class PyFExpr : public py::XObject { py::oobj as_type(const py::XArgs&); py::oobj count(const py::XArgs&); py::oobj countna(const py::XArgs&); + py::oobj cumsum(const py::XArgs&); py::oobj extend(const py::XArgs&); py::oobj first(const py::XArgs&); py::oobj last(const py::XArgs&); diff --git a/src/core/expr/fexpr_cumsum.cc b/src/core/expr/fexpr_cumsum.cc new file mode 100644 index 0000000000..52dccaca21 --- /dev/null +++ b/src/core/expr/fexpr_cumsum.cc @@ -0,0 +1,113 @@ +//------------------------------------------------------------------------------ +// Copyright 2022 H2O.ai +// +// Permission is hereby granted, free of charge, to any person obtaining a +// copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. +//------------------------------------------------------------------------------ +#include "column/const.h" +#include "column/cumsum.h" +#include "column/latent.h" +#include "documentation.h" +#include "expr/fexpr_func.h" +#include "expr/eval_context.h" +#include "expr/workframe.h" +#include "python/xargs.h" +#include "stype.h" + + +namespace dt { +namespace expr { + +class FExpr_cumsum : public FExpr_Func { + private: + ptrExpr arg_; + + public: + FExpr_cumsum(ptrExpr&& arg) + : arg_(std::move(arg)) {} + + std::string repr() const override{ + std::string out = "cumsum("; + out += arg_->repr(); + out += ')'; + return out; + } + + + Workframe evaluate_n(EvalContext& ctx) const override{ + Workframe wf = arg_->evaluate_n(ctx); + Groupby gby = Groupby::single_group(wf.nrows()); + + if (ctx.has_groupby()) { + wf.increase_grouping_mode(Grouping::GtoALL); + gby = ctx.get_groupby(); + } + + for (size_t i = 0; i < wf.ncols(); ++i) { + Column coli = evaluate1(wf.retrieve_column(i), gby); + wf.replace_column(i, std::move(coli)); + } + return wf; + } + + + Column evaluate1(Column&& col, const Groupby& gby) const { + SType stype = col.stype(); + switch (stype) { + case SType::VOID: + case SType::BOOL: + case SType::INT8: + case SType::INT16: + case SType::INT32: + case SType::INT64: return make(std::move(col), SType::INT64, gby); + case SType::FLOAT32: return make(std::move(col), SType::FLOAT32, gby); + case SType::FLOAT64: return make(std::move(col), SType::FLOAT64, gby); + default: throw TypeError() + << "Invalid column of type " << stype << " in " << repr(); + } + } + + + template + Column make(Column&& col, SType stype, const Groupby& gby) const { + if (col.stype() == SType::VOID) { + return Column(new ConstInt_ColumnImpl(col.nrows(), 0, stype)); + } else { + col.cast_inplace(stype); + return Column(new Latent_ColumnImpl( + new Cumsum_ColumnImpl(std::move(col), gby) + )); + } + } +}; + + +static py::oobj pyfn_cumsum(const py::XArgs& args) { + auto cumsum = args[0].to_oobj(); + return PyFExpr::make(new FExpr_cumsum(as_fexpr(cumsum))); +} + + +DECLARE_PYFN(&pyfn_cumsum) + ->name("cumsum") + ->docs(doc_dt_cumsum) + ->arg_names({"cumsum"}) + ->n_positional_args(1) + ->n_required_args(1); + +}} // dt::expr diff --git a/src/datatable/__init__.py b/src/datatable/__init__.py index 3e77525b81..f23550cf0b 100644 --- a/src/datatable/__init__.py +++ b/src/datatable/__init__.py @@ -27,6 +27,7 @@ as_type, by, cbind, + cumsum, cut, fread, FExpr, @@ -84,6 +85,7 @@ "corr", "count", "cov", + "cumsum", "cut", "dt", "exp", diff --git a/tests/dt/test-cumsum.py b/tests/dt/test-cumsum.py new file mode 100644 index 0000000000..92ed3defbe --- /dev/null +++ b/tests/dt/test-cumsum.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +#------------------------------------------------------------------------------- +# Copyright 2022 H2O.ai +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. +#------------------------------------------------------------------------------- +import math +import pytest +from datatable import dt, f, cumsum, FExpr, by +from tests import assert_equals + + +#------------------------------------------------------------------------------- +# Errors +#------------------------------------------------------------------------------- + +def test_cumsum_non_numeric(): + DT = dt.Frame(list('abcde')) + with pytest.raises(TypeError, match = r'Invalid column of type str32 in cumsum'): + DT[:, cumsum(f[0])] + +def test_cumsum_non_numeric_by(): + DT = dt.Frame(list('abcde')) + with pytest.raises(TypeError, match = r'Invalid column of type str32 in cumsum'): + DT[:, cumsum(f[0]), by(f[0])] + +def test_cumsum_no_argument(): + match = r'Function datatable.cumsum\(\) requires exactly 1 positional argument, ' \ + 'but none were given' + with pytest.raises(TypeError, match = match): + dt.cumsum() + + +#------------------------------------------------------------------------------- +# Normal +#------------------------------------------------------------------------------- + +def test_cumsum_str(): + assert str(cumsum(f.A)) == "FExpr" + assert str(cumsum(f.A) + 1) == "FExpr" + assert str(cumsum(f.A + f.B)) == "FExpr" + assert str(cumsum(f.B)) == "FExpr" + assert str(cumsum(f[:2])) == "FExpr" + + +def test_cumsum_empty_frame(): + DT = dt.Frame() + expr_cumsum = cumsum(DT) + assert isinstance(expr_cumsum, FExpr) + assert_equals(DT[:, f[:]], DT) + + +def test_cumsum_void(): + DT = dt.Frame([None, None, None]) + DT_cumsum = DT[:, cumsum(f[:])] + assert_equals(DT_cumsum, dt.Frame([0, 0, 0]/dt.int64)) + + +def test_cumsum_trivial(): + DT = dt.Frame([0]/dt.int64) + cumsum_fexpr = cumsum(f[:]) + DT_cumsum = DT[:, cumsum_fexpr] + assert isinstance(cumsum_fexpr, FExpr) + assert_equals(DT, DT_cumsum) + + +def test_cumsum_small(): + DT = dt.Frame([range(5), [-1, 1, None, 2, 5.5]]) + DT_cumsum = DT[:, cumsum(f[:])] + DT_ref = dt.Frame([[0, 1, 3, 6, 10]/dt.int64, [-1, 0, 0, 2, 7.5]]) + assert_equals(DT_cumsum, DT_ref) + + +def test_cumsum_groupby(): + DT = dt.Frame([[2, 1, 1, 1, 2], [1.5, -1.5, math.inf, 2, 3]]) + DT_cumsum = DT[:, cumsum(f[:]), by(f[0])] + DT_ref = dt.Frame([[1, 1, 1, 2, 2], [-1.5, math.inf, math.inf, 1.5, 4.5]/dt.float64]) + assert_equals(DT_cumsum, DT_ref) + + +def test_cumsum_grouped_column(): + DT = dt.Frame([2, 1, None, 1, 2]) + DT_cumsum = DT[:, cumsum(f[0]), by(f[0])] + DT_ref = dt.Frame([[None, 1, 1, 2, 2], [0, 1, 2, 2, 4]/dt.int64]) + assert_equals(DT_cumsum, DT_ref) diff --git a/tests/test-f.py b/tests/test-f.py index 0f20d1b2b2..87737940b3 100644 --- a/tests/test-f.py +++ b/tests/test-f.py @@ -421,3 +421,9 @@ def test_countna(): DT = dt.Frame(A = [9, 8, 2, 3, None, None, 3, 0, 5, 5, 8, None, 1]) assert_equals(DT[:, f.A.countna()], DT[:, dt.countna(f.A)]) +def test_cumsum(): + assert str(dt.cumsum(f.A)) == str(f.A.cumsum()) + assert str(dt.cumsum(f[:])) == str(f[:].cumsum()) + DT = dt.Frame(A = [9, 8, 2, 3, None, None, 3, 0, 5, 5, 8, None, 1]) + assert_equals(DT[:, f.A.cumsum()], DT[:, dt.cumsum(f.A)]) +