Skip to content

Commit

Permalink
Merge pull request #676 from finos/remove-nan-filtering-add-null-filt…
Browse files Browse the repository at this point in the history
…ering

Remove nan filtering, add null filtering
  • Loading branch information
texodus authored Aug 2, 2019
2 parents e03dcd4 + 7c38a9a commit 8cbd0fd
Show file tree
Hide file tree
Showing 30 changed files with 217 additions and 134 deletions.
16 changes: 8 additions & 8 deletions cpp/perspective/src/cpp/base.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -295,11 +295,11 @@ filter_op_to_str(t_filter_op op) {
case FILTER_OP_AND: {
return "and";
} break;
case FILTER_OP_IS_NAN: {
return "is_nan";
case FILTER_OP_IS_NULL: {
return "is null";
} break;
case FILTER_OP_IS_NOT_NAN: {
return "!is_nan";
case FILTER_OP_IS_NOT_NULL: {
return "is not null";
} break;
case FILTER_OP_IS_VALID: {
return "is not None";
Expand Down Expand Up @@ -340,10 +340,10 @@ str_to_filter_op(std::string str) {
return t_filter_op::FILTER_OP_AND;
} else if (str == "|") {
return t_filter_op::FILTER_OP_OR;
} else if (str == "is nan" || str == "is_nan") {
return t_filter_op::FILTER_OP_IS_NAN;
} else if (str == "is not nan" || str == "!is_nan") {
return t_filter_op::FILTER_OP_IS_NOT_NAN;
} else if (str == "is null") {
return t_filter_op::FILTER_OP_IS_NULL;
} else if (str == "is not null") {
return t_filter_op::FILTER_OP_IS_NOT_NULL;
} else if (str == "is not None") {
return t_filter_op::FILTER_OP_IS_VALID;
} else if (str == "is None") {
Expand Down
10 changes: 0 additions & 10 deletions cpp/perspective/src/cpp/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ t_config::t_config(const std::vector<std::string>& row_pivots,
, m_totals(TOTALS_BEFORE)
, m_combiner(combiner)
, m_fterms(fterms)
, m_handle_nan_sort(true)
, m_fmode(FMODE_SIMPLE_CLAUSES) {
for (const auto& p : row_pivots) {
m_row_pivots.push_back(t_pivot(p));
Expand All @@ -67,7 +66,6 @@ t_config::t_config(const std::vector<std::string>& row_pivots,
, m_totals(totals)
, m_combiner(combiner)
, m_fterms(fterms)
, m_handle_nan_sort(true)
, m_fmode(FMODE_SIMPLE_CLAUSES) {
for (const auto& p : row_pivots) {
m_row_pivots.push_back(t_pivot(p));
Expand All @@ -90,7 +88,6 @@ t_config::t_config(const std::vector<std::string>& row_pivots,
, m_totals(totals)
, m_combiner(combiner)
, m_fterms(fterms)
, m_handle_nan_sort(true)
, m_fmode(FMODE_SIMPLE_CLAUSES) {
for (const auto& p : row_pivots) {
m_row_pivots.push_back(t_pivot(p));
Expand All @@ -116,7 +113,6 @@ t_config::t_config(
: m_aggregates(aggregates)
, m_totals(TOTALS_BEFORE)
, m_combiner(FILTER_OP_AND)
, m_handle_nan_sort(true)
, m_fmode(FMODE_SIMPLE_CLAUSES) {
for (const auto& p : row_pivots) {
m_row_pivots.push_back(t_pivot(p));
Expand All @@ -129,7 +125,6 @@ t_config::t_config(const std::vector<std::string>& row_pivots, const t_aggspec&
: m_aggregates(std::vector<t_aggspec>{agg})
, m_totals(TOTALS_BEFORE)
, m_combiner(FILTER_OP_AND)
, m_handle_nan_sort(true)
, m_fmode(FMODE_SIMPLE_CLAUSES) {
for (const auto& p : row_pivots) {
m_row_pivots.push_back(t_pivot(p));
Expand Down Expand Up @@ -374,11 +369,6 @@ t_config::get_pivots() const {
return rval;
}

bool
t_config::handle_nan_sort() const {
return m_handle_nan_sort;
}

std::string
t_config::get_parent_pkey_column() const {
return m_parent_pkey_column;
Expand Down
9 changes: 3 additions & 6 deletions cpp/perspective/src/cpp/context_grouped_pkey.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,7 @@ t_ctx_grouped_pkey::init() {
auto pivots = m_config.get_row_pivots();
m_tree = std::make_shared<t_stree>(pivots, m_config.get_aggregates(), m_schema, m_config);
m_tree->init();
m_traversal
= std::shared_ptr<t_traversal>(new t_traversal(m_tree, m_config.handle_nan_sort()));
m_traversal = std::shared_ptr<t_traversal>(new t_traversal(m_tree));
m_minmax = std::vector<t_minmax>(m_config.get_num_aggregates());
m_init = true;
}
Expand Down Expand Up @@ -445,8 +444,7 @@ t_ctx_grouped_pkey::reset() {
m_tree = std::make_shared<t_stree>(pivots, m_config.get_aggregates(), m_schema, m_config);
m_tree->init();
m_tree->set_deltas_enabled(get_feature_state(CTX_FEAT_DELTA));
m_traversal
= std::shared_ptr<t_traversal>(new t_traversal(m_tree, m_config.handle_nan_sort()));
m_traversal = std::shared_ptr<t_traversal>(new t_traversal(m_tree));
}

void
Expand Down Expand Up @@ -661,8 +659,7 @@ t_ctx_grouped_pkey::rebuild() {
);
#endif

m_traversal
= std::shared_ptr<t_traversal>(new t_traversal(m_tree, m_config.handle_nan_sort()));
m_traversal = std::shared_ptr<t_traversal>(new t_traversal(m_tree));

set_expansion_state(expansion_state);

Expand Down
6 changes: 2 additions & 4 deletions cpp/perspective/src/cpp/context_one.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,7 @@ t_ctx1::init() {
auto pivots = m_config.get_row_pivots();
m_tree = std::make_shared<t_stree>(pivots, m_config.get_aggregates(), m_schema, m_config);
m_tree->init();
m_traversal
= std::shared_ptr<t_traversal>(new t_traversal(m_tree, m_config.handle_nan_sort()));
m_traversal = std::shared_ptr<t_traversal>(new t_traversal(m_tree));
m_minmax = std::vector<t_minmax>(m_config.get_num_aggregates());
m_init = true;
}
Expand Down Expand Up @@ -482,8 +481,7 @@ t_ctx1::reset() {
m_tree = std::make_shared<t_stree>(pivots, m_config.get_aggregates(), m_schema, m_config);
m_tree->init();
m_tree->set_deltas_enabled(get_feature_state(CTX_FEAT_DELTA));
m_traversal
= std::shared_ptr<t_traversal>(new t_traversal(m_tree, m_config.handle_nan_sort()));
m_traversal = std::shared_ptr<t_traversal>(new t_traversal(m_tree));
}

void
Expand Down
8 changes: 4 additions & 4 deletions cpp/perspective/src/cpp/context_two.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,9 +66,9 @@ t_ctx2::init() {
m_trees[treeidx]->init();
}

m_rtraversal = std::make_shared<t_traversal>(rtree(), m_config.handle_nan_sort());
m_rtraversal = std::make_shared<t_traversal>(rtree());

m_ctraversal = std::make_shared<t_traversal>(ctree(), m_config.handle_nan_sort());
m_ctraversal = std::make_shared<t_traversal>(ctree());
m_minmax = std::vector<t_minmax>(m_config.get_num_aggregates());
m_init = true;
}
Expand Down Expand Up @@ -806,8 +806,8 @@ t_ctx2::reset() {
m_trees[treeidx]->set_deltas_enabled(get_feature_state(CTX_FEAT_DELTA));
}

m_rtraversal = std::make_shared<t_traversal>(rtree(), m_config.handle_nan_sort());
m_ctraversal = std::make_shared<t_traversal>(ctree(), m_config.handle_nan_sort());
m_rtraversal = std::make_shared<t_traversal>(rtree());
m_ctraversal = std::make_shared<t_traversal>(ctree());
}

bool
Expand Down
2 changes: 1 addition & 1 deletion cpp/perspective/src/cpp/context_zero.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ t_ctx0::get_column_name(t_index idx) {

void
t_ctx0::init() {
m_traversal = std::make_shared<t_ftrav>(m_config.handle_nan_sort());
m_traversal = std::make_shared<t_ftrav>();
m_deltas = std::make_shared<t_zcdeltas>();
m_init = true;
}
Expand Down
2 changes: 1 addition & 1 deletion cpp/perspective/src/cpp/data_table.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -443,7 +443,7 @@ t_data_table::filter_cpp(t_filter_op combiner, const std::vector<t_fterm>& fterm
tval = ft(cell_val);
}

if (!cell_val.is_valid() || !tval) {
if ((ft.m_op != FILTER_OP_IS_NULL && !cell_val.is_valid()) || !tval) {
pass = false;
break;
}
Expand Down
18 changes: 14 additions & 4 deletions cpp/perspective/src/cpp/emscripten.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1300,8 +1300,14 @@ namespace binding {

template <>
bool
is_valid_filter(t_dtype type, t_val date_parser, t_val filter_term) {
if (type == DTYPE_DATE || type == DTYPE_TIME) {
is_valid_filter(t_dtype type, t_val date_parser, t_val filter_term, t_val filter_operand) {
std::string comp_str = filter_operand.as<std::string>();
t_filter_op comp = str_to_filter_op(comp_str);

if (comp == t_filter_op::FILTER_OP_IS_NULL
|| comp == t_filter_op::FILTER_OP_IS_NOT_NULL) {
return true;
} else if (type == DTYPE_DATE || type == DTYPE_TIME) {
t_val parsed_date = date_parser.call<t_val>("parse", filter_term);
return has_value(parsed_date);
} else {
Expand All @@ -1326,6 +1332,10 @@ namespace binding {
terms.push_back(mktscalar(get_interned_cstr(term.c_str())));
}
} break;
case FILTER_OP_IS_NULL:
case FILTER_OP_IS_NOT_NULL: {
terms.push_back(mktscalar(0));
} break;
default: {
switch (type) {
case DTYPE_INT32: {
Expand Down Expand Up @@ -1394,7 +1404,7 @@ namespace binding {
t_dtype type = schema.get_dtype(f[0].as<std::string>());

// validate the filter before it goes into the core engine
if (is_valid_filter(type, date_parser, f[2])) {
if (is_valid_filter(type, date_parser, f[2], f[1])) {
filter.push_back(make_filter_term(type, date_parser, f));
}
}
Expand Down Expand Up @@ -1926,4 +1936,4 @@ EMSCRIPTEN_BINDINGS(perspective) {
function("get_from_data_slice_one", &get_from_data_slice<t_ctx1>, allow_raw_pointers());
function("get_data_slice_two", &get_data_slice<t_ctx2>, allow_raw_pointers());
function("get_from_data_slice_two", &get_from_data_slice<t_ctx2>, allow_raw_pointers());
}
}
11 changes: 5 additions & 6 deletions cpp/perspective/src/cpp/flat_traversal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,9 @@

namespace perspective {

t_ftrav::t_ftrav(bool handle_nan_sort)
t_ftrav::t_ftrav()
: m_step_deletes(0)
, m_step_inserts(0)
, m_handle_nan_sort(handle_nan_sort) {
, m_step_inserts(0) {
m_index = std::make_shared<std::vector<t_mselem>>();
}

Expand Down Expand Up @@ -140,7 +139,7 @@ t_ftrav::sort_by(std::shared_ptr<const t_gstate> state, const t_config& config,
const std::vector<t_sortspec>& sortby) {
if (sortby.empty())
return;
t_multisorter sorter(get_sort_orders(sortby), m_handle_nan_sort);
t_multisorter sorter(get_sort_orders(sortby));
t_index size = m_index->size();
auto sort_elems = std::make_shared<std::vector<t_mselem>>(static_cast<size_t>(size));
m_sortby = sortby;
Expand Down Expand Up @@ -271,7 +270,7 @@ t_ftrav::step_end() {
}
}
std::swap(new_index, m_index);
t_multisorter sorter(get_sort_orders(m_sortby), m_handle_nan_sort);
t_multisorter sorter(get_sort_orders(m_sortby));
std::sort(m_index->begin(), m_index->end(), sorter);

m_pkeyidx.clear();
Expand Down Expand Up @@ -337,7 +336,7 @@ t_ftrav::reset_step_state() {
t_uindex
t_ftrav::lower_bound_row_idx(std::shared_ptr<const t_gstate> state, const t_config& config,
const std::vector<t_tscalar>& row) const {
t_multisorter sorter(get_sort_orders(m_sortby), m_handle_nan_sort);
t_multisorter sorter(get_sort_orders(m_sortby));
t_mselem target_val;

fill_sort_elem(state, config, row, target_val);
Expand Down
14 changes: 6 additions & 8 deletions cpp/perspective/src/cpp/multi_sort.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -200,19 +200,17 @@ nan_compare(t_sorttype order, const t_tscalar& a, const t_tscalar& b) {
return rval;
}

t_multisorter::t_multisorter(const std::vector<t_sorttype>& order, bool handle_nans)
: m_sort_order(order)
, m_handle_nans(handle_nans) {}
t_multisorter::t_multisorter(const std::vector<t_sorttype>& order)
: m_sort_order(order) {}

t_multisorter::t_multisorter(std::shared_ptr<const std::vector<t_mselem>> elems,
const std::vector<t_sorttype>& order, bool handle_nans)
t_multisorter::t_multisorter(
std::shared_ptr<const std::vector<t_mselem>> elems, const std::vector<t_sorttype>& order)
: m_sort_order(order)
, m_elems(elems)
, m_handle_nans(handle_nans) {}
, m_elems(elems) {}

bool
t_multisorter::operator()(const t_mselem& a, const t_mselem& b) const {
return cmp_mselem(a, b, m_sort_order, m_handle_nans);
return cmp_mselem(a, b, m_sort_order);
}

bool
Expand Down
8 changes: 4 additions & 4 deletions cpp/perspective/src/cpp/scalar.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1085,11 +1085,11 @@ t_tscalar::cmp(t_filter_op op, const t_tscalar& other) const {
case FILTER_OP_CONTAINS: {
return value.contains(other);
} break;
case FILTER_OP_IS_NAN: {
return std::isnan(to_double());
case FILTER_OP_IS_NULL: {
return m_status != STATUS_VALID;
} break;
case FILTER_OP_IS_NOT_NAN: {
return !std::isnan(to_double());
case FILTER_OP_IS_NOT_NULL: {
return m_status == STATUS_VALID;
} break;
case FILTER_OP_IS_VALID: {
return m_status == STATUS_VALID;
Expand Down
6 changes: 3 additions & 3 deletions cpp/perspective/src/cpp/sparse_tree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -854,7 +854,6 @@ t_stree::gen_aggidx() {
void
t_stree::update_agg_table(t_uindex nidx, t_agg_update_info& info, t_uindex src_ridx,
t_uindex dst_ridx, t_index nstrands, const t_gstate& gstate) {
static bool const enable_sticky_nan_fix = true;
for (t_uindex idx : info.m_dst_topo_sorted) {
const t_column* src = info.m_src[idx];
t_column* dst = info.m_dst[idx];
Expand All @@ -870,8 +869,7 @@ t_stree::update_agg_table(t_uindex nidx, t_agg_update_info& info, t_uindex src_r
t_tscalar dst_scalar = dst->get_scalar(dst_ridx);
old_value.set(dst_scalar);
new_value.set(dst_scalar.add(src_scalar));
if (enable_sticky_nan_fix
&& old_value.is_nan()) // is_nan returns false for non-float types
if (old_value.is_nan()) // is_nan returns false for non-float types
{
// if we previously had a NaN, add can't make it finite again; recalculate
// entire sum in case it is now finite
Expand Down Expand Up @@ -1162,11 +1160,13 @@ t_stree::update_agg_table(t_uindex nidx, t_agg_update_info& info, t_uindex src_r
t_tscalar rval;
rval.set(std::uint64_t(0));
rval.m_type = values[0].m_type;

for (const auto& v : values) {
if (v.is_nan())
continue;
rval = rval.add(v);
}

return rval;
}));
dst->set_scalar(dst_ridx, new_value);
Expand Down
2 changes: 1 addition & 1 deletion cpp/perspective/src/cpp/table.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -190,4 +190,4 @@ Table::process_index_column(t_data_table& data_table) {
}
}

} // namespace perspective
} // namespace perspective
7 changes: 3 additions & 4 deletions cpp/perspective/src/cpp/traversal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,8 @@ t_vdnode::t_vdnode(bool expanded, bool has_children)
: m_expanded(expanded)
, m_has_children(has_children) {}

t_traversal::t_traversal(std::shared_ptr<const t_stree> tree, bool handle_nan_sort)
: m_tree(tree)
, m_handle_nan_sort(handle_nan_sort) {
t_traversal::t_traversal(std::shared_ptr<const t_stree> tree)
: m_tree(tree) {
t_stnode_vec rchildren;
tree->get_child_nodes(0, rchildren);
populate_root_children(rchildren);
Expand Down Expand Up @@ -153,7 +152,7 @@ t_traversal::expand_node(const std::vector<t_sortspec>& sortby, t_index exp_idx,
}

std::vector<t_sorttype> sort_orders = get_sort_orders(sortby);
t_multisorter sorter(sortelems, sort_orders, m_handle_nan_sort);
t_multisorter sorter(sortelems, sort_orders);
argsort(sorted_idx, sorter);
} else {
for (t_index i = 0, loop_end = sorted_idx.size(); i != loop_end; ++i)
Expand Down
4 changes: 2 additions & 2 deletions cpp/perspective/src/include/perspective/base.h
Original file line number Diff line number Diff line change
Expand Up @@ -183,8 +183,8 @@ enum t_filter_op {
FILTER_OP_IN,
FILTER_OP_NOT_IN,
FILTER_OP_AND,
FILTER_OP_IS_NAN,
FILTER_OP_IS_NOT_NAN,
FILTER_OP_IS_NULL,
FILTER_OP_IS_NOT_NULL,
FILTER_OP_IS_VALID,
FILTER_OP_IS_NOT_VALID
};
Expand Down
Loading

0 comments on commit 8cbd0fd

Please sign in to comment.