Skip to content

Commit

Permalink
feat: implement bool_and and bool_or (#3754)
Browse files Browse the repository at this point in the history
# Description

This PR adds two sets of boolean functions:

1. List Boolean Functions:
- `list_bool_and`: Returns true if all non-null elements in a list are
true, false if any non-null element is false, and null if all elements
are null or the list is empty
- `list_bool_or`: Returns true if any non-null element in a list is
true, false if all non-null elements are false, and null if all elements
are null or the list is empty

2. Boolean Aggregation Functions:
- `bool_and()`: Returns true if all non-null values in a column are
true, false if any non-null value is false, and null if all values are
null or the column is empty
- `bool_or()`: Returns true if any non-null value in a column is true,
false if all non-null values are false, and null if all values are null
or the column is empty

Both sets of functions handle nulls consistently: null values are ignored,
and the result is null only when there are no non-null values to combine.
  • Loading branch information
f4t4nt authored Feb 6, 2025
1 parent d40aef8 commit 1b5a077
Show file tree
Hide file tree
Showing 22 changed files with 909 additions and 3 deletions.
4 changes: 4 additions & 0 deletions daft/daft/__init__.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -947,6 +947,8 @@ class PyExpr:
def stddev(self) -> PyExpr: ...
def min(self) -> PyExpr: ...
def max(self) -> PyExpr: ...
# Aggregation: true if all non-null values are true, false if any non-null
# value is false, null if all values are null.
def bool_and(self) -> PyExpr: ...
# Aggregation: true if any non-null value is true, false if all non-null
# values are false, null if all values are null.
def bool_or(self) -> PyExpr: ...
def any_value(self, ignore_nulls: bool) -> PyExpr: ...
def agg_list(self) -> PyExpr: ...
def agg_concat(self) -> PyExpr: ...
Expand Down Expand Up @@ -1171,6 +1173,8 @@ def list_sum(expr: PyExpr) -> PyExpr: ...
def list_mean(expr: PyExpr) -> PyExpr: ...
def list_min(expr: PyExpr) -> PyExpr: ...
def list_max(expr: PyExpr) -> PyExpr: ...
# Per-list boolean AND: true if all non-null elements are true, false if any
# non-null element is false, null if the list is empty or all elements are null.
def list_bool_and(expr: PyExpr) -> PyExpr: ...
# Per-list boolean OR: true if any non-null element is true, false if all
# non-null elements are false, null if the list is empty or all elements are null.
def list_bool_or(expr: PyExpr) -> PyExpr: ...
def list_slice(expr: PyExpr, start: PyExpr, end: PyExpr | None = None) -> PyExpr: ...
def list_chunk(expr: PyExpr, size: int) -> PyExpr: ...

Expand Down
122 changes: 122 additions & 0 deletions daft/expressions/expressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1012,6 +1012,68 @@ def max(self) -> Expression:
expr = self._expr.max()
return Expression._from_pyexpr(expr)

def bool_and(self) -> Expression:
    """Aggregates the expression with a boolean AND over its non-null values.

    This is an aggregation expression (for use with ``DataFrame.agg`` or
    ``GroupedDataFrame.agg``); for the per-list variant see
    ``Expression.list.bool_and``.

    The aggregated result is:
        - True if all non-null values are True
        - False if any non-null value is False
        - null if there are no non-null values

    Example:
        >>> import daft
        >>> df = daft.from_pydict({"values": [True, False, None]})
        >>> df.agg(df["values"].bool_and()).to_pydict()
        {'values': [False]}

    Returns:
        Expression: a Boolean aggregation expression.
    """
    expr = self._expr.bool_and()
    return Expression._from_pyexpr(expr)

def bool_or(self) -> Expression:
    """Aggregates the expression with a boolean OR over its non-null values.

    This is an aggregation expression (for use with ``DataFrame.agg`` or
    ``GroupedDataFrame.agg``); for the per-list variant see
    ``Expression.list.bool_or``.

    The aggregated result is:
        - True if any non-null value is True
        - False if all non-null values are False
        - null if there are no non-null values

    Example:
        >>> import daft
        >>> df = daft.from_pydict({"values": [False, True, None]})
        >>> df.agg(df["values"].bool_or()).to_pydict()
        {'values': [True]}

    Returns:
        Expression: a Boolean aggregation expression.
    """
    expr = self._expr.bool_or()
    return Expression._from_pyexpr(expr)

def any_value(self, ignore_nulls=False) -> Expression:
"""Returns any value in the expression.
Expand Down Expand Up @@ -3240,6 +3302,66 @@ def max(self) -> Expression:
"""
return Expression._from_pyexpr(native.list_max(self._expr))

def bool_and(self) -> Expression:
    """Reduces every list to the boolean AND of its elements, ignoring nulls.

    Result for each list:
        - True when every non-null element is True
        - False when at least one non-null element is False
        - null when the list is empty or holds only nulls

    Example:
        >>> import daft
        >>> df = daft.from_pydict({"values": [[True, True], [True, False], [None, None], []]})
        >>> df.with_column("result", df["values"].list.bool_and()).collect()
        ╭───────────────┬─────────╮
        │ values        ┆ result  │
        │ ---           ┆ ---     │
        │ List[Boolean] ┆ Boolean │
        ╞═══════════════╪═════════╡
        │ [true, true]  ┆ true    │
        ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
        │ [true, false] ┆ false   │
        ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
        │ [None, None]  ┆ None    │
        ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
        │ []            ┆ None    │
        ╰───────────────┴─────────╯
        <BLANKLINE>
        (Showing first 4 of 4 rows)
    """
    reduced = native.list_bool_and(self._expr)
    return Expression._from_pyexpr(reduced)

def bool_or(self) -> Expression:
    """Reduces every list to the boolean OR of its elements, ignoring nulls.

    Result for each list:
        - True when at least one non-null element is True
        - False when every non-null element is False
        - null when the list is empty or holds only nulls

    Example:
        >>> import daft
        >>> df = daft.from_pydict({"values": [[True, False], [False, False], [None, None], []]})
        >>> df.with_column("result", df["values"].list.bool_or()).collect()
        ╭────────────────┬─────────╮
        │ values         ┆ result  │
        │ ---            ┆ ---     │
        │ List[Boolean]  ┆ Boolean │
        ╞════════════════╪═════════╡
        │ [true, false]  ┆ true    │
        ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
        │ [false, false] ┆ false   │
        ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
        │ [None, None]   ┆ None    │
        ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
        │ []             ┆ None    │
        ╰────────────────┴─────────╯
        <BLANKLINE>
        (Showing first 4 of 4 rows)
    """
    reduced = native.list_bool_or(self._expr)
    return Expression._from_pyexpr(reduced)

def sort(self, desc: bool | Expression = False, nulls_first: bool | Expression | None = None) -> Expression:
"""Sorts the inner lists of a list column.
Expand Down
4 changes: 4 additions & 0 deletions docs/sphinx/source/expressions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,8 @@ The following can be used with DataFrame.agg or GroupedDataFrame.agg
.. autosummary::
:toctree: doc_gen/expression_methods

Expression.bool_and
Expression.bool_or
Expression.count
Expression.sum
Expression.mean
Expand Down Expand Up @@ -233,6 +235,8 @@ List
:toctree: doc_gen/expression_methods
:template: autosummary/accessor_method.rst

Expression.list.bool_and
Expression.list.bool_or
Expression.list.chunk
Expression.list.count
Expression.list.get
Expand Down
134 changes: 134 additions & 0 deletions src/daft-core/src/array/ops/bool_agg.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
use arrow2::array::{Array, BooleanArray};
use common_error::DaftResult;

use crate::{
array::{
ops::{DaftBoolAggable, GroupIndices},
DataArray,
},
datatypes::BooleanType,
};

impl DaftBoolAggable for DataArray<BooleanType> {
type Output = DaftResult<Self>;

fn bool_and(&self) -> Self::Output {
let array = self.data();
let array = array.as_any().downcast_ref::<BooleanArray>().unwrap();

// If array is empty or all null, return null
if array.null_count() == array.len() {
return Ok(Self::from((
self.field.name.as_ref(),
Box::new(BooleanArray::from_iter(std::iter::once(None))),
)));
}

// Look for first non-null false value
let mut result = true;
for i in 0..array.len() {
if !array.is_null(i) && !array.value(i) {
result = false;
break;
}
}

Ok(Self::from((
self.field.name.as_ref(),
Box::new(BooleanArray::from_iter(std::iter::once(Some(result)))),
)))
}

fn bool_or(&self) -> Self::Output {
let array = self.data();
let array = array.as_any().downcast_ref::<BooleanArray>().unwrap();

// If array is empty or all null, return null
if array.null_count() == array.len() {
return Ok(Self::from((
self.field.name.as_ref(),
Box::new(BooleanArray::from_iter(std::iter::once(None))),
)));
}

// Look for first non-null true value
let mut result = false;
for i in 0..array.len() {
if !array.is_null(i) && array.value(i) {
result = true;
break;
}
}

Ok(Self::from((
self.field.name.as_ref(),
Box::new(BooleanArray::from_iter(std::iter::once(Some(result)))),
)))
}

fn grouped_bool_and(&self, groups: &GroupIndices) -> Self::Output {
let array = self.data();
let array = array.as_any().downcast_ref::<BooleanArray>().unwrap();
let mut results = Vec::with_capacity(groups.len());

for group in groups {
if group.is_empty() {
results.push(None);
continue;
}

let mut all_null = true;
let mut result = true;

for &idx in group {
if !array.is_null(idx as usize) {
all_null = false;
if !array.value(idx as usize) {
result = false;
break;
}
}
}

results.push(if all_null { None } else { Some(result) });
}

Ok(Self::from((
self.field.name.as_ref(),
Box::new(BooleanArray::from_iter(results)),
)))
}

fn grouped_bool_or(&self, groups: &GroupIndices) -> Self::Output {
let array = self.data();
let array = array.as_any().downcast_ref::<BooleanArray>().unwrap();
let mut results = Vec::with_capacity(groups.len());

for group in groups {
if group.is_empty() {
results.push(None);
continue;
}

let mut all_null = true;
let mut result = false;

for &idx in group {
if !array.is_null(idx as usize) {
all_null = false;
if array.value(idx as usize) {
result = true;
break;
}
}
}

results.push(if all_null { None } else { Some(result) });
}

Ok(Self::from((
self.field.name.as_ref(),
Box::new(BooleanArray::from_iter(results)),
)))
}
}
Loading

0 comments on commit 1b5a077

Please sign in to comment.