diff --git a/docs/api/dt/fillna.rst b/docs/api/dt/fillna.rst new file mode 100644 index 0000000000..4a99e69d79 --- /dev/null +++ b/docs/api/dt/fillna.rst @@ -0,0 +1,152 @@ + +.. xfunction:: datatable.fillna + :src: src/core/expr/fexpr_fillna.cc pyfn_fillna + :tests: tests/dt/test-fillna.py + :cvar: doc_dt_fillna + :signature: fillna(cols, reverse=False) + + .. x-version-added:: 1.1.0 + + For each column from `cols` fill the missing values with the + previous or subsequent non-missing values. In the presence of :func:`by()` + the filling is performed group-wise. + + Parameters + ---------- + cols: FExpr + Input columns. + + reverse: bool + If ``False``, the missing values are filled by using the closest + previous non-missing values as a replacement. If ``True``, + the closest subsequent non-missing values are used. + + return: FExpr + f-expression that converts input columns into the columns filled + with the previous/subsequent non-missing values. + + + Examples + -------- + + Create a sample datatable frame:: + + >>> from datatable import dt, f, by + >>> DT = dt.Frame({'building': ['a', 'a', 'b', 'b', 'a', 'a', 'b', 'b'], + ... 'var1': [1.5, None, 2.1, 2.2, 1.2, 1.3, 2.4, None], + ... 'var2': [100, 110, 105, None, 102, None, 103, 107], + ... 'var3': [10, 11, None, None, None, None, None, None], + ... 
'var4': [1, 2, 3, 4, 5, 6, 7, 8]}) + | building var1 var2 var3 var4 + | str32 float64 int32 int32 int32 + -- + -------- ------- ----- ----- ----- + 0 | a 1.5 100 10 1 + 1 | a NA 110 11 2 + 2 | b 2.1 105 NA 3 + 3 | b 2.2 NA NA 4 + 4 | a 1.2 102 NA 5 + 5 | a 1.3 NA NA 6 + 6 | b 2.4 103 NA 7 + 7 | b NA 107 NA 8 + [8 rows x 5 columns] + + Fill down on a single column:: + + >>> DT[:, dt.fillna(f.var1)] + | var1 + | float64 + -- + ------- + 0 | 1.5 + 1 | 1.5 + 2 | 2.1 + 3 | 2.2 + 4 | 1.2 + 5 | 1.3 + 6 | 2.4 + 7 | 2.4 + [8 rows x 1 column] + + + Fill up on a single column:: + + >>> DT[:, dt.fillna(f.var1, reverse = True)] + | var1 + | float64 + -- + ------- + 0 | 1.5 + 1 | 2.1 + 2 | 2.1 + 3 | 2.2 + 4 | 1.2 + 5 | 1.3 + 6 | 2.4 + 7 | NA + [8 rows x 1 column] + + + Fill down on multiple columns:: + + >>> DT[:, dt.fillna(f['var1':])] + | var1 var2 var3 var4 + | float64 int32 int32 int32 + -- + ------- ----- ----- ----- + 0 | 1.5 100 10 1 + 1 | 1.5 110 11 2 + 2 | 2.1 105 11 3 + 3 | 2.2 105 11 4 + 4 | 1.2 102 11 5 + 5 | 1.3 102 11 6 + 6 | 2.4 103 11 7 + 7 | 2.4 107 11 8 + [8 rows x 4 columns] + + + Fill up on multiple columns:: + + >>> DT[:, dt.fillna(f['var1':], reverse = True)] + | var1 var2 var3 var4 + | float64 int32 int32 int32 + -- + ------- ----- ----- ----- + 0 | 1.5 100 10 1 + 1 | 2.1 110 11 2 + 2 | 2.1 105 NA 3 + 3 | 2.2 102 NA 4 + 4 | 1.2 102 NA 5 + 5 | 1.3 103 NA 6 + 6 | 2.4 103 NA 7 + 7 | NA 107 NA 8 + [8 rows x 4 columns] + + + Fill down in the presence of :func:`by()`:: + + >>> DT[:, dt.fillna(f['var1':]), by('building')] + | building var1 var2 var3 var4 + | str32 float64 int32 int32 int32 + -- + -------- ------- ----- ----- ----- + 0 | a 1.5 100 10 1 + 1 | a 1.5 110 11 2 + 2 | a 1.2 102 11 5 + 3 | a 1.3 102 11 6 + 4 | b 2.1 105 NA 3 + 5 | b 2.2 105 NA 4 + 6 | b 2.4 103 NA 7 + 7 | b 2.4 107 NA 8 + [8 rows x 5 columns] + + + Fill up in the presence of :func:`by()`:: + + >>> DT[:, dt.fillna(f['var1':], reverse = True), by('building')] + | building var1 var2 var3 
var4 + | str32 float64 int32 int32 int32 + -- + -------- ------- ----- ----- ----- + 0 | a 1.5 100 10 1 + 1 | a 1.2 110 11 2 + 2 | a 1.2 102 NA 5 + 3 | a 1.3 NA NA 6 + 4 | b 2.1 105 NA 3 + 5 | b 2.2 103 NA 4 + 6 | b 2.4 103 NA 7 + 7 | b NA 107 NA 8 + [8 rows x 5 columns] diff --git a/docs/api/fexpr.rst b/docs/api/fexpr.rst index 50b2c261dd..67e111df3e 100644 --- a/docs/api/fexpr.rst +++ b/docs/api/fexpr.rst @@ -175,6 +175,9 @@ * - :meth:`.cumsum()` - Same as :func:`dt.cumsum()`. + * - :meth:`.fillna()` + - Same as :func:`dt.fillna()`. + * - :meth:`.first()` - Same as :func:`dt.first()`. @@ -303,6 +306,7 @@ .cumprod() .cumsum() .extend() + .fillna() .first() .last() .len() diff --git a/docs/api/fexpr/fillna.rst b/docs/api/fexpr/fillna.rst new file mode 100644 index 0000000000..1f32ee9643 --- /dev/null +++ b/docs/api/fexpr/fillna.rst @@ -0,0 +1,7 @@ + +.. xmethod:: datatable.FExpr.fillna + :src: src/core/expr/fexpr.cc PyFExpr::fillna + :cvar: doc_FExpr_fillna + :signature: fillna(reverse=False) + + Equivalent to :func:`dt.fillna(cols, reverse=False)`. diff --git a/docs/api/index-api.rst b/docs/api/index-api.rst index b0a5d12b18..48bdc3ddfe 100644 --- a/docs/api/index-api.rst +++ b/docs/api/index-api.rst @@ -175,6 +175,8 @@ Functions - Calculate the cumulative sum of values per column * - :func:`cov()` - Calculate covariance between two columns + * - :func:`fillna()` + - Impute missing values * - :func:`max()` - Find the largest element per column * - :func:`mean()` @@ -252,6 +254,7 @@ Other cut()
dt
f
+ fillna()
first()
fread()
g
diff --git a/docs/manual/comparison_with_rdatatable.rst b/docs/manual/comparison_with_rdatatable.rst index d1890111d5..ed9c513637 100644 --- a/docs/manual/comparison_with_rdatatable.rst +++ b/docs/manual/comparison_with_rdatatable.rst @@ -666,7 +666,6 @@ equivalent in ``datatable`` yet, that we would likely implement - Missing values functions - - `nafill `__ - `fcoalesce `__ Also, at the moment, custom aggregations in the ``j`` section are not supported diff --git a/docs/releases/v1.1.0.rst b/docs/releases/v1.1.0.rst index cd7881c4d5..4822995c50 100644 --- a/docs/releases/v1.1.0.rst +++ b/docs/releases/v1.1.0.rst @@ -98,6 +98,9 @@ -[new] Class :class:`dt.FExpr` now has method :meth:`.countna()`, which behaves exactly as the equivalent base level function :func:`dt.countna()`. + -[new] Added function :func:`dt.fillna()`, as well as :meth:`.fillna()` method, + to impute missing values. [#3279] + -[enh] Function :func:`dt.re.match()` now supports case insensitive matching. [#3216] -[enh] Function :func:`dt.qcut()` can now be used in a groupby context. 
[#3165] diff --git a/src/core/documentation.h b/src/core/documentation.h index 1630cfb339..978717d2f7 100644 --- a/src/core/documentation.h +++ b/src/core/documentation.h @@ -36,6 +36,7 @@ extern const char* doc_dt_cummin; extern const char* doc_dt_cumprod; extern const char* doc_dt_cumsum; extern const char* doc_dt_cut; +extern const char* doc_dt_fillna; extern const char* doc_dt_first; extern const char* doc_dt_fread; extern const char* doc_dt_ifelse; @@ -289,6 +290,7 @@ extern const char* doc_FExpr_cummin; extern const char* doc_FExpr_cumprod; extern const char* doc_FExpr_cumsum; extern const char* doc_FExpr_extend; +extern const char* doc_FExpr_fillna; extern const char* doc_FExpr_first; extern const char* doc_FExpr_last; extern const char* doc_FExpr_max; diff --git a/src/core/expr/fexpr.cc b/src/core/expr/fexpr.cc index 33d16e76e5..8a8fcfd848 100644 --- a/src/core/expr/fexpr.cc +++ b/src/core/expr/fexpr.cc @@ -365,6 +365,21 @@ DECLARE_METHOD(&PyFExpr::cumsum) ->docs(dt::doc_FExpr_cumsum); + +oobj PyFExpr::fillna(const XArgs& args) { + auto fillnaFn = oobj::import("datatable", "fillna"); + oobj reverse = args[0]? 
args[0].to_oobj() : py::obool(false); + return fillnaFn.call({this, reverse}); +} + +DECLARE_METHOD(&PyFExpr::fillna) + ->name("fillna") + ->docs(dt::doc_FExpr_fillna) + ->arg_names({"reverse"}) + ->n_positional_or_keyword_args(1) + ->n_required_args(0); + + oobj PyFExpr::first(const XArgs&) { auto firstFn = oobj::import("datatable", "first"); return firstFn.call({this}); diff --git a/src/core/expr/fexpr.h b/src/core/expr/fexpr.h index 6fac207f0a..0503a0ab45 100644 --- a/src/core/expr/fexpr.h +++ b/src/core/expr/fexpr.h @@ -187,6 +187,7 @@ class PyFExpr : public py::XObject { py::oobj cumprod(const py::XArgs&); py::oobj cumsum(const py::XArgs&); py::oobj extend(const py::XArgs&); + py::oobj fillna(const py::XArgs&); py::oobj first(const py::XArgs&); py::oobj last(const py::XArgs&); py::oobj max(const py::XArgs&); diff --git a/src/core/expr/fexpr_fillna.cc b/src/core/expr/fexpr_fillna.cc new file mode 100644 index 0000000000..1768d56985 --- /dev/null +++ b/src/core/expr/fexpr_fillna.cc @@ -0,0 +1,138 @@ +//------------------------------------------------------------------------------ +// Copyright 2022 H2O.ai +// +// Permission is hereby granted, free of charge, to any person obtaining a +// copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. +//------------------------------------------------------------------------------ +#include "documentation.h" +#include "expr/fexpr_func.h" +#include "expr/eval_context.h" +#include "python/xargs.h" +#include "parallel/api.h" +namespace dt { +namespace expr { + + +class FExpr_FillNA : public FExpr_Func { + private: + ptrExpr arg_; + bool reverse_; + size_t : 56; + + public: + FExpr_FillNA(ptrExpr &&arg, bool reverse) + : arg_(std::move(arg)), + reverse_(reverse) + {} + + + std::string repr() const override { + std::string out = "fillna"; + out += '('; + out += arg_->repr(); + out += ", reverse="; + out += reverse_? "True" : "False"; + out += ')'; + return out; + } + + + template + static RowIndex fill_rowindex(Column& col, const Groupby& gby) { + Buffer buf = Buffer::mem(static_cast(col.nrows()) * sizeof(int32_t)); + auto indices = static_cast(buf.xptr()); + + dt::parallel_for_dynamic( + gby.size(), + [&](size_t gi) { + size_t i1, i2; + gby.get_group(gi, &i1, &i2); + size_t fill_id = REVERSE? i2 - 1 : i1; + + if (REVERSE) { + for (size_t i = i2; i-- > i1;) { + size_t is_valid = col.get_element_isvalid(i); + fill_id = is_valid? i : fill_id; + indices[i] = static_cast(fill_id); + } + } else { + for (size_t i = i1; i < i2; ++i) { + size_t is_valid = col.get_element_isvalid(i); + fill_id = is_valid? 
i : fill_id; + indices[i] = static_cast(fill_id); + } + } + + } + ); + + return RowIndex(std::move(buf), RowIndex::ARR32|RowIndex::SORTED); + } + + + Workframe evaluate_n(EvalContext &ctx) const override { + Workframe wf = arg_->evaluate_n(ctx); + Groupby gby = Groupby::single_group(wf.nrows()); + if (ctx.has_groupby()) { + wf.increase_grouping_mode(Grouping::GtoALL); + gby = ctx.get_groupby(); + } + + for (size_t i = 0; i < wf.ncols(); ++i) { + Column coli = wf.retrieve_column(i); + bool is_grouped = ctx.has_group_column( + wf.get_frame_id(i), + wf.get_column_id(i) + ); + + auto stats = coli.get_stats_if_exist(); + bool na_stats_exists = stats && stats->is_computed(Stat::NaCount); + bool has_nas = na_stats_exists? stats->nacount() + : true; + + if (has_nas && !is_grouped){ + RowIndex ri = reverse_? fill_rowindex(coli, gby) + : fill_rowindex(coli, gby); + coli.apply_rowindex(ri); + } + wf.replace_column(i, std::move(coli)); + } + + return wf; + } + +}; + + +static py::oobj pyfn_fillna(const py::XArgs &args) { + auto column = args[0].to_oobj(); + auto reverse = args[1].to(false); + return PyFExpr::make(new FExpr_FillNA(as_fexpr(column), reverse)); +} + + +DECLARE_PYFN(&pyfn_fillna) + ->name("fillna") + ->docs(doc_dt_fillna) + ->arg_names({"column", "reverse"}) + ->n_required_args(1) + ->n_positional_args(1) + ->n_positional_or_keyword_args(1); + + +}} // dt::expr diff --git a/src/datatable/__init__.py b/src/datatable/__init__.py index 5c981decb8..efdb76e3ec 100644 --- a/src/datatable/__init__.py +++ b/src/datatable/__init__.py @@ -33,8 +33,9 @@ cumprod, cumsum, cut, - fread, FExpr, + fillna, + fread, ifelse, init_styles, intersect, @@ -98,8 +99,9 @@ "cut", "dt", "exp", - "f", "FExpr", + "f", + "fillna", "first", "float32", "float64", diff --git a/tests/dt/test-fillna.py b/tests/dt/test-fillna.py new file mode 100644 index 0000000000..25cee1ee3f --- /dev/null +++ b/tests/dt/test-fillna.py @@ -0,0 +1,141 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- 
+#------------------------------------------------------------------------------- +# Copyright 2022 H2O.ai +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. 
+#------------------------------------------------------------------------------- +import pytest +from datatable import dt, f, fillna, FExpr, by +from tests import assert_equals + + +#------------------------------------------------------------------------------- +# Errors +#------------------------------------------------------------------------------- + +msg = "Argument reverse in function datatable.fillna\\(\\) should be a boolean, " +msg += "instead got " +def test_fillna_reverse_not_a_boolean(): + DT = dt.Frame([1, 2, None, 4, 5]) + with pytest.raises(TypeError, match = msg): + DT[:, fillna(f[0], reverse='True')] + + +def test_fillna_reverse_not_a_boolean_by(): + DT = dt.Frame([1, 2, None, 4, 5]) + with pytest.raises(TypeError, match = msg): + DT[:, fillna(f[0], reverse = 'True'), by(f[0])] + + +def test_fillna_no_argument(): + msg = (f"Function datatable.fillna\\(\\) " + "requires at least 1 positional argument, but none were given") + with pytest.raises(TypeError, match = msg): + fillna() + + +#------------------------------------------------------------------------------- +# Normal +#------------------------------------------------------------------------------- + +def test_fillna_str(): + assert str(fillna(f.A, reverse=False)) == "FExpr<" + fillna.__name__ + "(f.A, reverse=False)>" + assert str(fillna(f.A, reverse=False) + 1) == "FExpr<" + fillna.__name__ + "(f.A, reverse=False) + 1>" + assert str(fillna(f.A + f.B, reverse = True)) == "FExpr<" + fillna.__name__ + "(f.A + f.B, reverse=True)>" + assert str(fillna(f.B, reverse = True)) == "FExpr<" + fillna.__name__ + "(f.B, reverse=True)>" + assert str(fillna(f[:2], reverse = False)) == "FExpr<"+ fillna.__name__ + "(f[:2], reverse=False)>" + + +def test_fillna_empty_frame(): + DT = dt.Frame() + expr_fillna = fillna(DT, reverse=False) + assert isinstance(expr_fillna, FExpr) + assert_equals(DT[:, fillna(f[:], reverse=False)], DT) + + +def test_fillna_void(): + DT = dt.Frame([None, None, None]) + DT_fillna = 
DT[:, fillna(f[:], reverse=True)] + assert_equals(DT_fillna, DT) + + +def test_fillna_trivial(): + DT = dt.Frame([0]/dt.int64) + fillna_fexpr = fillna(f[:], reverse = True) + DT_fillna = DT[:, fillna_fexpr] + assert isinstance(fillna_fexpr, FExpr) + assert_equals(DT, DT_fillna) + + +def test_fillna_bool(): + DT = dt.Frame([None, False, None, True, False, True]) + DT_fillna = DT[:, [fillna(f[:], reverse = False), + fillna(f[:], reverse = True)]] + DT_ref = dt.Frame([ + [None, False, False, True, False, True], + [False, False, True, True, False, True] + ]) + assert_equals(DT_fillna, DT_ref) + + +def test_fillna_small(): + DT = dt.Frame([None, 3, None, 4]) + DT_fillna = DT[:, [fillna(f[:], reverse = False), + fillna(f[:], reverse = True)]] + DT_ref = dt.Frame([ + [None, 3, 3, 4], + [3, 3, 4, 4] + ]) + assert_equals(DT_fillna, DT_ref) + + +def test_fillna_string(): + DT = dt.Frame([None, 'a', None, 'b']) + DT_fillna = DT[:, [fillna(f[:]), fillna(f[:], reverse = True)]] + DT_ref = dt.Frame([ + [None, 'a', 'a', 'b'], + ['a', 'a', 'b', 'b'] + ]) + assert_equals(DT_fillna, DT_ref) + + +def test_fillna_grouped(): + DT = dt.Frame([[15, None, 136, 93, 743, None, None, 91], + ['a','a','a','b','b','c','c','c']]) + DT_fillna = DT[:, [fillna(f[:]), fillna(f[:], reverse = True)], by(f[-1])] + DT_ref = dt.Frame({ + 'C1':['a','a','a','b','b','c','c','c'], + 'C0':[15, 15, 136, 93, 743, None, None, 91], + 'C2':[15, 136, 136, 93, 743, 91, 91, 91], + }) + assert_equals(DT_fillna, DT_ref) + + +def test_fillna_grouped_column(): + DT = dt.Frame([2, 1, None, 1, 2]) + DT_bfill = DT[:, [fillna(f[0], reverse = False), + fillna(f[0], reverse = True)], by(f[0])] + DT_ref = dt.Frame([ + [None, 1, 1, 2, 2], + [None, 1, 1, 2, 2], + [None, 1, 1, 2, 2] + ]) + assert_equals(DT_bfill, DT_ref) + diff --git a/tests/test-f.py b/tests/test-f.py index 8aad9663b3..681618deed 100644 --- a/tests/test-f.py +++ b/tests/test-f.py @@ -463,3 +463,12 @@ def test_cumprod(): DT = dt.Frame(A = [9, 8, 2, 3, None, None, 
3, 0, 5, 5, 8, None, 1]) assert_equals(DT[:, f.A.cumprod()], DT[:, dt.cumprod(f.A)]) + +def test_fillna(): + assert str(dt.fillna(f.A, reverse=False)) == str(f.A.fillna(reverse=False)) + assert str(dt.fillna(f.A, reverse=True)) == str(f.A.fillna(reverse=True)) + assert str(dt.fillna(f[:], True)) == str(f[:].fillna(True)) + DT = dt.Frame(A = [9, 8, 2, 3, None, None, 3, 0, 5, 5, 8, None, 1]) + assert_equals(DT[:, f.A.fillna(reverse=True)], DT[:, dt.fillna(f.A, True)]) + assert_equals(DT[:, f.A.fillna(reverse=False)], DT[:, dt.fillna(f.A, False)]) +