Skip to content

Commit

Permalink
Merge pull request #1596 from finos/regex_functions
Browse files Browse the repository at this point in the history
Add Regex functions using Exprtk and RE2
  • Loading branch information
texodus authored Oct 31, 2021
2 parents df330c2 + bd7e69f commit 7acc9b0
Show file tree
Hide file tree
Showing 31 changed files with 1,574 additions and 165 deletions.
16 changes: 16 additions & 0 deletions cmake/re2.txt.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Helper project whose only job is to fetch the RE2 sources at a pinned tag.
# The configure, build, install and test steps are all set to empty commands
# below, so this project checks the sources out and does nothing else.
cmake_minimum_required(VERSION 3.7.2)

# NONE: no language is enabled for this project.
project(re2-download NONE)

include(ExternalProject)

ExternalProject_Add(
    re2

    # Where the sources come from and which tag is checked out.
    GIT_REPOSITORY    https://github.com/google/re2.git
    GIT_TAG           2021-09-01

    # Checkout and build directories, both under the binary dir.
    SOURCE_DIR        "${CMAKE_BINARY_DIR}/re2-src"
    BINARY_DIR        "${CMAKE_BINARY_DIR}/re2-build"

    # NOTE(review): CMAKE_ARGS is documented as input to the default configure
    # step, which is disabled below, so this likely has no effect here -
    # confirm before relying on the toolchain file being forwarded.
    CMAKE_ARGS        "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}"

    # Empty commands disable every step after the download.
    CONFIGURE_COMMAND ""
    BUILD_COMMAND     ""
    INSTALL_COMMAND   ""
    TEST_COMMAND      ""
)
9 changes: 7 additions & 2 deletions cpp/perspective/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,7 @@ elseif(PSP_CPP_BUILD OR PSP_PYTHON_BUILD)
#########################
# PYTHON BINDINGS BUILD #
#########################
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
include_directories("${PSP_PYTHON_SRC}/perspective/include")

# Set CMP0094 to NEW - find the first version that matches constraints,
Expand Down Expand Up @@ -426,6 +427,9 @@ endif()
# Build minimal arrow itself
psp_build_dep("arrow" "${PSP_CMAKE_MODULE_PATH}/arrow.txt.in")

# Build re2 as our regex library
psp_build_dep("re2" "${PSP_CMAKE_MODULE_PATH}/re2.txt.in")

find_package(Flatbuffers)
if(NOT FLATBUFFERS_FOUND)
# Stop configuration with an error. The whitespace after FATAL_ERROR is
# required: without it CMake lexes the keyword and the quoted text as a single
# legacy unquoted argument, so no message mode is recognised and configuration
# carries on instead of aborting.
message(FATAL_ERROR "${Red}Flatbuffers could not be located${ColorReset}")
Expand Down Expand Up @@ -508,6 +512,7 @@ set (SOURCE_FILES
${PSP_CPP_SRC}/src/cpp/raii_impl_osx.cpp
${PSP_CPP_SRC}/src/cpp/raii_impl_win.cpp
${PSP_CPP_SRC}/src/cpp/range.cpp
${PSP_CPP_SRC}/src/cpp/regex.cpp
${PSP_CPP_SRC}/src/cpp/rlookup.cpp
${PSP_CPP_SRC}/src/cpp/scalar.cpp
${PSP_CPP_SRC}/src/cpp/schema_column.cpp
Expand Down Expand Up @@ -580,7 +585,7 @@ if (PSP_WASM_BUILD)
add_library(psp ${WASM_SOURCE_FILES})
target_compile_definitions(psp PRIVATE PSP_ENABLE_WASM=1)
set_target_properties(psp PROPERTIES COMPILE_FLAGS "")
target_link_libraries(psp arrow)
target_link_libraries(psp arrow re2)

# "esm/perspective.cpp.js" from CMAKE_EXECUTABLE_SYNTAX
add_executable(perspective_esm src/cpp/emscripten.cpp)
Expand Down Expand Up @@ -652,7 +657,7 @@ elseif(PSP_CPP_BUILD OR PSP_PYTHON_BUILD)
endif()

# Link against minimal arrow static library
target_link_libraries(psp arrow)
target_link_libraries(psp arrow re2)

target_link_libraries(binding psp)

Expand Down
92 changes: 48 additions & 44 deletions cpp/perspective/src/cpp/arrow_csv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,47 +20,55 @@
#include <arrow/csv/reader.h>
#endif


template <class TimePoint>
static inline arrow::TimestampType::c_type ConvertTimePoint(TimePoint tp, arrow::TimeUnit::type unit) {
auto duration = tp.time_since_epoch();
switch (unit) {
case arrow::TimeUnit::SECOND:
return std::chrono::duration_cast<std::chrono::seconds>(duration).count();
case arrow::TimeUnit::MILLI:
return std::chrono::duration_cast<std::chrono::milliseconds>(duration).count();
case arrow::TimeUnit::MICRO:
return std::chrono::duration_cast<std::chrono::microseconds>(duration).count();
case arrow::TimeUnit::NANO:
return std::chrono::duration_cast<std::chrono::nanoseconds>(duration).count();
default:
// Compiler errors without default case even though all enum cases are handled
assert(0);
return 0;
}
// Returns the time elapsed since the epoch of `tp` as an integer count
// expressed in `unit` (seconds, milliseconds, microseconds or nanoseconds).
// An unrecognised unit trips the assert; where the assert is compiled out
// the function returns 0.
static inline arrow::TimestampType::c_type
ConvertTimePoint(TimePoint tp, arrow::TimeUnit::type unit) {
    namespace chr = std::chrono;

    const auto since_epoch = tp.time_since_epoch();

    if (unit == arrow::TimeUnit::SECOND) {
        return chr::duration_cast<chr::seconds>(since_epoch).count();
    }
    if (unit == arrow::TimeUnit::MILLI) {
        return chr::duration_cast<chr::milliseconds>(since_epoch).count();
    }
    if (unit == arrow::TimeUnit::MICRO) {
        return chr::duration_cast<chr::microseconds>(since_epoch).count();
    }
    if (unit == arrow::TimeUnit::NANO) {
        return chr::duration_cast<chr::nanoseconds>(since_epoch).count();
    }

    // Fallback for a value outside the four units handled above.
    assert(0);
    return 0;
}


static inline bool ParseYYYY_MM_DD(const char* s,
arrow_vendored::date::year_month_day* out) {
uint16_t year = 0;
uint8_t month = 0;
uint8_t day = 0;
if (ARROW_PREDICT_FALSE(s[4] != '-') || ARROW_PREDICT_FALSE(s[7] != '-')) {
return false;
}
if (ARROW_PREDICT_FALSE(!arrow::internal::ParseUnsigned(s + 0, 4, &year))) {
return false;
}
if (ARROW_PREDICT_FALSE(!arrow::internal::ParseUnsigned(s + 5, 2, &month))) {
return false;
}
if (ARROW_PREDICT_FALSE(!arrow::internal::ParseUnsigned(s + 8, 2, &day))) {
return false;
}
*out = {arrow_vendored::date::year{year}, arrow_vendored::date::month{month},
arrow_vendored::date::day{day}};
return out->ok();
static inline bool
ParseYYYY_MM_DD(const char* s, arrow_vendored::date::year_month_day* out) {
uint16_t year = 0;
uint8_t month = 0;
uint8_t day = 0;
if (ARROW_PREDICT_FALSE(s[4] != '-') || ARROW_PREDICT_FALSE(s[7] != '-')) {
return false;
}
if (ARROW_PREDICT_FALSE(!arrow::internal::ParseUnsigned(s + 0, 4, &year))) {
return false;
}
if (ARROW_PREDICT_FALSE(
!arrow::internal::ParseUnsigned(s + 5, 2, &month))) {
return false;
}
if (ARROW_PREDICT_FALSE(!arrow::internal::ParseUnsigned(s + 8, 2, &day))) {
return false;
}
*out = {arrow_vendored::date::year{year},
arrow_vendored::date::month{month}, arrow_vendored::date::day{day}};
return out->ok();
}

namespace perspective {
Expand Down Expand Up @@ -143,9 +151,7 @@ namespace apachearrow {
if (length == 23) {
// "YYYY-MM-DD[ T]hh:mm:ss.sss"
arrow_vendored::date::year_month_day ymd;
if (ARROW_PREDICT_FALSE(
!ParseYYYY_MM_DD(
s, &ymd))) {
if (ARROW_PREDICT_FALSE(!ParseYYYY_MM_DD(s, &ymd))) {
return false;
}
std::chrono::seconds seconds;
Expand All @@ -166,9 +172,7 @@ namespace apachearrow {
} else if (length == 25) {
// "2008-09-15[ T]15:53:00+05:00"
arrow_vendored::date::year_month_day ymd;
if (ARROW_PREDICT_FALSE(
!ParseYYYY_MM_DD(
s, &ymd))) {
if (ARROW_PREDICT_FALSE(!ParseYYYY_MM_DD(s, &ymd))) {
return false;
}
std::chrono::seconds seconds;
Expand Down
49 changes: 34 additions & 15 deletions cpp/perspective/src/cpp/computed_expression.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,8 @@ t_computed_expression::t_computed_expression(

void
t_computed_expression::compute(std::shared_ptr<t_data_table> source_table,
std::shared_ptr<t_data_table> destination_table, t_expression_vocab& vocab) const {
std::shared_ptr<t_data_table> destination_table, t_expression_vocab& vocab,
t_regex_mapping& regex_mapping) const {
// TODO: share symtables across pre/re/compute
exprtk::symbol_table<t_tscalar> sym_table;

Expand All @@ -100,7 +101,7 @@ t_computed_expression::compute(std::shared_ptr<t_data_table> source_table,

// Create a function store, with is_type_validator set to false as we
// are calculating values, not type-checking.
t_computed_function_store function_store(vocab, false);
t_computed_function_store function_store(vocab, regex_mapping, false);
function_store.register_computed_functions(sym_table);

exprtk::expression<t_tscalar> expr_definition;
Expand Down Expand Up @@ -209,13 +210,14 @@ t_computed_expression_parser::precompute(const std::string& expression_alias,
const std::string& expression_string,
const std::string& parsed_expression_string,
const std::vector<std::pair<std::string, std::string>>& column_ids,
std::shared_ptr<t_schema> schema, t_expression_vocab& vocab) {
std::shared_ptr<t_schema> schema, t_expression_vocab& vocab,
t_regex_mapping& regex_mapping) {
exprtk::symbol_table<t_tscalar> sym_table;
sym_table.add_constants();

// Create a function store, with is_type_validator set to true as we are
// just getting the output types.
t_computed_function_store function_store(vocab, true);
t_computed_function_store function_store(vocab, regex_mapping, true);
function_store.register_computed_functions(sym_table);

std::vector<t_tscalar> values;
Expand Down Expand Up @@ -271,15 +273,16 @@ t_computed_expression_parser::get_dtype(const std::string& expression_alias,
const std::string& expression_string,
const std::string& parsed_expression_string,
const std::vector<std::pair<std::string, std::string>>& column_ids,
const t_schema& schema, t_expression_error& error, t_expression_vocab& vocab) {
const t_schema& schema, t_expression_error& error,
t_expression_vocab& vocab, t_regex_mapping& regex_mapping) {
exprtk::symbol_table<t_tscalar> sym_table;
sym_table.add_constants();

std::vector<t_tscalar> values;

// Create a function store, with is_type_validator set to true as we are
// just validating the output types.
t_computed_function_store function_store(vocab, true);
t_computed_function_store function_store(vocab, regex_mapping, true);
function_store.register_computed_functions(sym_table);

auto num_input_columns = column_ids.size();
Expand Down Expand Up @@ -418,8 +421,8 @@ t_validated_expression_map::get_expression_errors() const {
return m_expression_errors;
}

t_computed_function_store::t_computed_function_store(
t_expression_vocab& vocab, bool is_type_validator)
t_computed_function_store::t_computed_function_store(t_expression_vocab& vocab,
t_regex_mapping& regex_mapping, bool is_type_validator)
: m_day_of_week_fn(computed_function::day_of_week(vocab, is_type_validator))
, m_month_of_year_fn(
computed_function::month_of_year(vocab, is_type_validator))
Expand All @@ -428,34 +431,44 @@ t_computed_function_store::t_computed_function_store(
, m_order_fn(computed_function::order(is_type_validator))
, m_upper_fn(computed_function::upper(vocab, is_type_validator))
, m_lower_fn(computed_function::lower(vocab, is_type_validator))
, m_to_string_fn(computed_function::to_string(vocab, is_type_validator)) {}
, m_to_string_fn(computed_function::to_string(vocab, is_type_validator))
, m_match_fn(computed_function::match(regex_mapping))
, m_fullmatch_fn(computed_function::fullmatch(regex_mapping))
, m_search_fn(
computed_function::search(vocab, regex_mapping, is_type_validator)) {}

void
t_computed_function_store::register_computed_functions(
exprtk::symbol_table<t_tscalar>& sym_table) {
// General/numeric functions
sym_table.add_function("bucket", t_computed_expression_parser::BUCKET_FN);
sym_table.add_reserved_function(
"inrange", t_computed_expression_parser::INRANGE_FN);
sym_table.add_reserved_function(
"min", t_computed_expression_parser::MIN_FN);
sym_table.add_reserved_function(
"max", t_computed_expression_parser::MAX_FN);
sym_table.add_function(
"percent_of", t_computed_expression_parser::PERCENT_OF_FN);
sym_table.add_function("is_null", t_computed_expression_parser::IS_NULL_FN);
sym_table.add_function(
"is_not_null", t_computed_expression_parser::IS_NOT_NULL_FN);

// Date/datetime functions
sym_table.add_function(
"hour_of_day", t_computed_expression_parser::HOUR_OF_DAY_FN);
sym_table.add_function("day_of_week", m_day_of_week_fn);
sym_table.add_function("month_of_year", m_month_of_year_fn);

// String functions
sym_table.add_function("intern", m_intern_fn);
sym_table.add_function("concat", m_concat_fn);
sym_table.add_function("order", m_order_fn);
sym_table.add_function("upper", m_upper_fn);
sym_table.add_function("lower", m_lower_fn);
sym_table.add_function("length", t_computed_expression_parser::LENGTH_FN);
sym_table.add_function("string", m_to_string_fn);
sym_table.add_function(
"percent_of", t_computed_expression_parser::PERCENT_OF_FN);
sym_table.add_function("is_null", t_computed_expression_parser::IS_NULL_FN);
sym_table.add_function(
"is_not_null", t_computed_expression_parser::IS_NOT_NULL_FN);

// Type conversion functions
sym_table.add_function(
"integer", t_computed_expression_parser::TO_INTEGER_FN);
sym_table.add_function("float", t_computed_expression_parser::TO_FLOAT_FN);
Expand All @@ -464,6 +477,12 @@ t_computed_function_store::register_computed_functions(
sym_table.add_function("date", t_computed_expression_parser::MAKE_DATE_FN);
sym_table.add_function(
"datetime", t_computed_expression_parser::MAKE_DATETIME_FN);
sym_table.add_function("string", m_to_string_fn);

// Regex functions
sym_table.add_function("match", m_match_fn);
sym_table.add_function("fullmatch", m_fullmatch_fn);
sym_table.add_function("search", m_search_fn);

// Register static free functions as well
sym_table.add_function("today", computed_function::today);
Expand Down
Loading

0 comments on commit 7acc9b0

Please sign in to comment.