diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc
index 3ea426c85f489..0830f7b2e114a 100644
--- a/cpp/src/gandiva/function_registry_string.cc
+++ b/cpp/src/gandiva/function_registry_string.cc
@@ -404,6 +404,21 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
                      kResultNullIfNull, "gdv_fn_castVARBINARY_float64_int64",
                      NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
 
+      NativeFunction("to_hex", {"hex"}, DataTypeVector{binary()}, utf8(),
+                     kResultNullIfNull, "to_hex_binary", NativeFunction::kNeedsContext),
+
+      NativeFunction("to_hex", {"hex"}, DataTypeVector{utf8()}, utf8(), kResultNullIfNull,
+                     "to_hex_binary", NativeFunction::kNeedsContext),
+
+      NativeFunction("to_hex", {"hex"}, DataTypeVector{int64()}, utf8(),
+                     kResultNullIfNull, "to_hex_int64", NativeFunction::kNeedsContext),
+
+      NativeFunction("to_hex", {"hex"}, DataTypeVector{int32()}, utf8(),
+                     kResultNullIfNull, "to_hex_int32", NativeFunction::kNeedsContext),
+
+      NativeFunction("from_hex", {"unhex"}, DataTypeVector{utf8()}, binary(),
+                     kResultNullIfNull, "from_hex_utf8", NativeFunction::kNeedsContext),
+
       NativeFunction("split_part", {}, DataTypeVector{utf8(), utf8(), int32()}, utf8(),
                      kResultNullIfNull, "split_part",
                      NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors)};
diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc
index 48c24b862b8fe..2020c3936fd83 100644
--- a/cpp/src/gandiva/precompiled/string_ops.cc
+++ b/cpp/src/gandiva/precompiled/string_ops.cc
@@ -16,11 +16,13 @@
 // under the License.
 
 // String functions
+#include "arrow/util/logging.h"
 #include "arrow/util/value_parsing.h"
 
 extern "C" {
 
 #include <algorithm>
+#include <cinttypes>
 #include <climits>
 #include <cstdio>
 #include <cstdlib>
@@ -2195,4 +2197,142 @@ const char* byte_substr_binary_int32_int32(gdv_int64 context, const char* text,
   memcpy(ret, text + startPos, *out_len);
   return ret;
 }
+
+// Gets a binary object and returns its hexadecimal representation. That representation
+// maps each byte in the input to a 2-length string containing a hexadecimal number.
+// - Examples:
+//     - foo -> 666F6F = 66[f] 6F[o] 6F[o]
+//     - bar -> 626172 = 62[b] 61[a] 72[r]
+// If the output size would overflow or memory cannot be allocated, an error message is
+// set on the context and an empty string is returned.
+FORCE_INLINE
+const char* to_hex_binary(int64_t context, const char* text, int32_t text_len,
+                          int32_t* out_len) {
+  if (text_len == 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  // two output characters per input byte, plus the terminator written by snprintf
+  if (text_len > (INT32_MAX - 1) / 2) {
+    gdv_fn_context_set_error_msg(context, "Would overflow maximum output size");
+    *out_len = 0;
+    return "";
+  }
+
+  auto ret =
+      reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, text_len * 2 + 1));
+
+  if (ret == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
+    *out_len = 0;
+    return "";
+  }
+
+  uint32_t ret_index = 0;
+  uint32_t max_len = static_cast<uint32_t>(text_len) * 2;
+  // "%02X" emits exactly two digits and a terminator for a value in [0, 255]
+  uint32_t max_char_to_write = 3;
+
+  for (gdv_int32 i = 0; i < text_len; i++) {
+    DCHECK(ret_index < max_len);
+
+    int32_t ch = static_cast<int32_t>(text[i]) & 0xFF;
+
+    ret_index += snprintf(ret + ret_index, max_char_to_write, "%02X", ch);
+  }
+
+  *out_len = static_cast<int32_t>(ret_index);
+  return ret;
+}
+
+// Returns the uppercase hexadecimal representation of a 64-bit integer without leading
+// zeros. Negative values are printed as their unsigned bit pattern, e.g.
+// -1 -> FFFFFFFFFFFFFFFF.
+FORCE_INLINE
+const char* to_hex_int64(int64_t context, int64_t data, int32_t* out_len) {
+  // two hex digits per byte; snprintf also needs room for the terminating NUL
+  const int64_t hex_long_max_size = 2 * sizeof(int64_t);
+  auto ret = reinterpret_cast<char*>(
+      gdv_fn_context_arena_malloc(context, hex_long_max_size + 1));
+
+  if (ret == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
+    *out_len = 0;
+    return "";
+  }
+  snprintf(ret, hex_long_max_size + 1, "%" PRIX64, static_cast<uint64_t>(data));
+
+  *out_len = static_cast<int32_t>(strlen(ret));
+  return ret;
+}
+
+// Returns the uppercase hexadecimal representation of a 32-bit integer without leading
+// zeros. Negative values are printed as their unsigned bit pattern, e.g. -1 -> FFFFFFFF.
+FORCE_INLINE
+const char* to_hex_int32(int64_t context, int32_t data, int32_t* out_len) {
+  // two hex digits per byte; snprintf also needs room for the terminating NUL
+  const int32_t max_size = 2 * sizeof(int32_t);
+  auto ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, max_size + 1));
+
+  if (ret == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
+    *out_len = 0;
+    return "";
+  }
+  snprintf(ret, max_size + 1, "%" PRIX32, static_cast<uint32_t>(data));
+
+  *out_len = static_cast<int32_t>(strlen(ret));
+  return ret;
+}
+
+// Decodes a hexadecimal string into the bytes it represents, e.g. "414243" -> ABC.
+// Upper and lower case digits are both accepted. If the input has an odd length or
+// contains a non-hexadecimal character, an error message is set on the context and an
+// empty string is returned.
+FORCE_INLINE
+const char* from_hex_utf8(int64_t context, const char* text, int32_t text_len,
+                          int32_t* out_len) {
+  if (text_len == 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  // the input string should have a length multiple of two
+  if (text_len % 2 != 0) {
+    gdv_fn_context_set_error_msg(
+        context, "Error parsing hex string, length was not a multiple of two.");
+    *out_len = 0;
+    return "";
+  }
+
+  char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, text_len / 2));
+
+  if (ret == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
+    *out_len = 0;
+    return "";
+  }
+
+  // converting hex encoded string to normal string
+  int32_t j = 0;
+  for (int32_t i = 0; i < text_len; i += 2) {
+    char b1 = text[i];
+    char b2 = text[i + 1];
+    // isxdigit has undefined behavior for negative arguments, so convert through
+    // unsigned char before classifying
+    if (isxdigit(static_cast<unsigned char>(b1)) &&
+        isxdigit(static_cast<unsigned char>(b2))) {
+      // [a-fA-F0-9]
+      ret[j++] = to_binary_from_hex(b1) * 16 + to_binary_from_hex(b2);
+    } else {
+      gdv_fn_context_set_error_msg(
+          context, "Error parsing hex string, one or more bytes are not valid.");
+      *out_len = 0;
+      return "";
+    }
+  }
+  *out_len = j;
+  return ret;
+}
 }  // extern "C"
diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc
index 6221dffb30224..a9da16b6b78d4 100644
--- a/cpp/src/gandiva/precompiled/string_ops_test.cc
+++ b/cpp/src/gandiva/precompiled/string_ops_test.cc
@@ -1755,4 +1755,230 @@ TEST(TestStringOps, TestConvertToBigEndian) {
 #endif
 }
 
+TEST(TestStringOps, TestToHex) {
+  gandiva::ExecutionContext ctx;
+  uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+  int32_t out_len = 0;
+  int32_t in_len = 0;
+  const char* out_str;
+
+  in_len = 10;
+  char in_str[] = {0x54, 0x65, 0x73, 0x74, 0x53, 0x74, 0x72, 0x69, 0x6E, 0x67};
+  out_str = to_hex_binary(ctx_ptr, in_str, in_len, &out_len);
+  std::string output = std::string(out_str, out_len);
+  EXPECT_EQ(out_len, 2 * in_len);
+  EXPECT_EQ(output, "54657374537472696E67");
+
+  in_len = 0;
+  out_str = to_hex_binary(ctx_ptr, "", in_len, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_EQ(out_len, 0);
+  EXPECT_EQ(output, "");
+
+  in_len = 1;
+  char in_str_one_char[] = {0x54};
+  out_str = to_hex_binary(ctx_ptr, in_str_one_char, in_len, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_EQ(out_len, 2 * in_len);
+  EXPECT_EQ(output, "54");
+
+  in_len = 16;
+  char in_str_spaces[] = {0x54, 0x65, 0x73, 0x74, 0x20, 0x77, 0x69, 0x74,
+                          0x68, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x73};
+  out_str = to_hex_binary(ctx_ptr, in_str_spaces, in_len, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_EQ(output, "54657374207769746820737061636573");
+
+  in_len = 20;
+  char in_str_break_line[] = {0x54, 0x65, 0x78, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x0A,
+                              0x62, 0x72, 0x65, 0x61, 0x6B, 0x20, 0x6C, 0x69, 0x6E, 0x65};
+  out_str = to_hex_binary(ctx_ptr, in_str_break_line, in_len, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_EQ(out_len, 2 * in_len);
+  EXPECT_EQ(output, "5465787420776974680A627265616B206C696E65");
+
+  in_len = 27;
+  char in_str_with_num[] = {0x54, 0x65, 0x73, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68,
+                            0x20, 0x6E, 0x75, 0x6D, 0x62, 0x65, 0x72, 0x73, 0x20,
+                            0x31, 0x20, 0x2B, 0x20, 0x31, 0x20, 0x3D, 0x20, 0x32};
+  out_str = to_hex_binary(ctx_ptr, in_str_with_num, in_len, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_EQ(out_len, 2 * in_len);
+  EXPECT_EQ(output, "546573742077697468206E756D626572732031202B2031203D2032");
+
+  in_len = 22;
+  char in_str_with_tabs[] = {0x09, 0x0A, 0x09, 0x0A, 0x09, 0x0A, 0x09, 0x0A,
+                             0x0A, 0x0A, 0x09, 0x20, 0x61, 0x20, 0x6C, 0x65,
+                             0x74, 0x74, 0x40, 0x5D, 0x65, 0x72};
+  out_str = to_hex_binary(ctx_ptr, in_str_with_tabs, in_len, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_EQ(out_len, 2 * in_len);
+  EXPECT_EQ(output, "090A090A090A090A0A0A092061206C657474405D6572");
+
+  in_len = 22;
+  const char* binary_string =
+      "\x09\x0A\x09\x0A\x09\x0A\x09\x0A\x0A\x0A\x09\x20\x61\x20\x6C\x65\x74\x74\x40\x5D"
+      "\x65\x72";
+  out_str = to_hex_binary(ctx_ptr, binary_string, in_len, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_EQ(out_len, 2 * in_len);
+  EXPECT_EQ(output, "090A090A090A090A0A0A092061206C657474405D6572");
+}
+
+TEST(TestStringOps, TestToHexInt64) {
+  gandiva::ExecutionContext ctx;
+  uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+  int32_t out_len = 0;
+  const char* out_str;
+
+  int64_t max_data = INT64_MAX;
+  out_str = to_hex_int64(ctx_ptr, max_data, &out_len);
+  std::string output = std::string(out_str, out_len);
+  EXPECT_FALSE(ctx.has_error());
+  EXPECT_EQ(out_len, 16);
+  EXPECT_EQ(output, "7FFFFFFFFFFFFFFF");
+  ctx.Reset();
+
+  int64_t min_data = INT64_MIN;
+  out_str = to_hex_int64(ctx_ptr, min_data, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_FALSE(ctx.has_error());
+  EXPECT_EQ(out_len, 16);
+  EXPECT_EQ(output, "8000000000000000");
+  ctx.Reset();
+
+  int64_t zero_data = 0;
+  out_str = to_hex_int64(ctx_ptr, zero_data, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_FALSE(ctx.has_error());
+  EXPECT_EQ(out_len, 1);
+  EXPECT_EQ(output, "0");
+  ctx.Reset();
+
+  int64_t minus_zero_data = -0;
+  out_str = to_hex_int64(ctx_ptr, minus_zero_data, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_FALSE(ctx.has_error());
+  EXPECT_EQ(out_len, 1);
+  EXPECT_EQ(output, "0");
+  ctx.Reset();
+
+  int64_t minus_one_data = -1;
+  out_str = to_hex_int64(ctx_ptr, minus_one_data, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_FALSE(ctx.has_error());
+  EXPECT_EQ(out_len, 16);
+  EXPECT_EQ(output, "FFFFFFFFFFFFFFFF");
+  ctx.Reset();
+
+  int64_t one_data = 1;
+  out_str = to_hex_int64(ctx_ptr, one_data, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_FALSE(ctx.has_error());
+  EXPECT_EQ(out_len, 1);
+  EXPECT_EQ(output, "1");
+  ctx.Reset();
+}
+
+TEST(TestStringOps, TestToHexInt32) {
+  gandiva::ExecutionContext ctx;
+  uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+  int32_t out_len = 0;
+  const char* out_str;
+
+  int32_t max_data = INT32_MAX;
+  out_str = to_hex_int32(ctx_ptr, max_data, &out_len);
+  std::string output = std::string(out_str, out_len);
+  EXPECT_FALSE(ctx.has_error());
+  EXPECT_EQ(out_len, 8);
+  EXPECT_EQ(output, "7FFFFFFF");
+  ctx.Reset();
+
+  int32_t min_data = INT32_MIN;
+  out_str = to_hex_int32(ctx_ptr, min_data, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_FALSE(ctx.has_error());
+  EXPECT_EQ(out_len, 8);
+  EXPECT_EQ(output, "80000000");
+  ctx.Reset();
+
+  int32_t zero_data = 0;
+  out_str = to_hex_int32(ctx_ptr, zero_data, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_FALSE(ctx.has_error());
+  EXPECT_EQ(out_len, 1);
+  EXPECT_EQ(output, "0");
+  ctx.Reset();
+
+  int32_t minus_zero_data = -0;
+  out_str = to_hex_int32(ctx_ptr, minus_zero_data, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_FALSE(ctx.has_error());
+  EXPECT_EQ(out_len, 1);
+  EXPECT_EQ(output, "0");
+  ctx.Reset();
+
+  int32_t minus_one_data = -1;
+  out_str = to_hex_int32(ctx_ptr, minus_one_data, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_FALSE(ctx.has_error());
+  EXPECT_EQ(out_len, 8);
+  EXPECT_EQ(output, "FFFFFFFF");
+  ctx.Reset();
+
+  int32_t one_data = 1;
+  out_str = to_hex_int32(ctx_ptr, one_data, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_FALSE(ctx.has_error());
+  EXPECT_EQ(out_len, 1);
+  EXPECT_EQ(output, "1");
+  ctx.Reset();
+}
+
+TEST(TestStringOps, TestFromHex) {
+  gandiva::ExecutionContext ctx;
+  uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+  gdv_int32 out_len = 0;
+  const char* out_str;
+
+  out_str = from_hex_utf8(ctx_ptr, "414243", 6, &out_len);
+  std::string output = std::string(out_str, out_len);
+  EXPECT_EQ(output, "ABC");
+
+  out_str = from_hex_utf8(ctx_ptr, "", 0, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_EQ(output, "");
+
+  out_str = from_hex_utf8(ctx_ptr, "41", 2, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_EQ(output, "A");
+
+  out_str = from_hex_utf8(ctx_ptr, "6d6D", 4, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_EQ(output, "mm");
+
+  out_str = from_hex_utf8(ctx_ptr, "6f6d", 4, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_EQ(output, "om");
+
+  out_str = from_hex_utf8(ctx_ptr, "4f4D", 4, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_EQ(output, "OM");
+
+  out_str = from_hex_utf8(ctx_ptr, "T", 1, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_EQ(output, "");
+  EXPECT_THAT(
+      ctx.get_error(),
+      ::testing::HasSubstr("Error parsing hex string, length was not a multiple of"));
+  ctx.Reset();
+
+  out_str = from_hex_utf8(ctx_ptr, "\\x41\\x42\\x43", 12, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_EQ(output, "");
+  EXPECT_THAT(
+      ctx.get_error(),
+      ::testing::HasSubstr("Error parsing hex string, one or more bytes are not valid."));
+  ctx.Reset();
+}
 }  // namespace gandiva
diff --git a/cpp/src/gandiva/precompiled/types.h b/cpp/src/gandiva/precompiled/types.h
index 2e6e9c6eb7a76..a91240473d898 100644
--- a/cpp/src/gandiva/precompiled/types.h
+++ b/cpp/src/gandiva/precompiled/types.h
@@ -560,6 +560,16 @@ const char* right_utf8_int32(gdv_int64 context, const char* text, gdv_int32 text
 const char* binary_string(gdv_int64 context, const char* text, gdv_int32 text_len,
                           gdv_int32* out_len);
 
+const char* to_hex_binary(int64_t context, const char* text, int32_t text_len,
+                          int32_t* out_len);
+
+const char* to_hex_int64(int64_t context, int64_t data, int32_t* out_len);
+
+const char* to_hex_int32(int64_t context, int32_t data, int32_t* out_len);
+
+const char* from_hex_utf8(int64_t context, const char* text, int32_t text_len,
+                          int32_t* out_len);
+
 int32_t castINT_utf8(int64_t context, const char* data, int32_t len);
 
 int64_t castBIGINT_utf8(int64_t context, const char* data, int32_t len);
diff --git a/cpp/src/gandiva/tests/projector_test.cc b/cpp/src/gandiva/tests/projector_test.cc
index dea66a792ba3d..c2a24742a6238 100644
--- a/cpp/src/gandiva/tests/projector_test.cc
+++ b/cpp/src/gandiva/tests/projector_test.cc
@@ -1643,4 +1643,100 @@ TEST_F(TestProjector, TestBround) {
   EXPECT_ARROW_ARRAY_EQUALS(exp_bround, outputs.at(0));
 }
 
+TEST_F(TestProjector, TestConcatFromHex) {
+  // schema for input fields
+  auto field0 = field("f0", arrow::utf8());
+  auto schema = arrow::schema({field0});
+
+  // output fields
+  auto field_from_hex = field("fromhex", arrow::binary());
+
+  // Build expression
+  auto from_hex_exp =
+      TreeExprBuilder::MakeExpression("from_hex", {field0}, field_from_hex);
+
+  std::shared_ptr<Projector> projector;
+  auto status = Projector::Make(schema, {from_hex_exp}, TestConfiguration(), &projector);
+  EXPECT_TRUE(status.ok()) << status.message();
+
+  // Create a row-batch with some sample data
+  int num_records = 6;
+  auto array0 = MakeArrowArrayUtf8({"414243", "", "41", "4f4D", "6f6d", "4f"},
+                                   {true, true, true, true, true, true});
+  // expected output
+  auto exp_from_hex = MakeArrowArrayBinary({"ABC", "", "A", "OM", "om", "O"},
+                                           {true, true, true, true, true, true});
+
+  // prepare input record batch
+  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0});
+
+  // Evaluate expression
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in_batch, pool_, &outputs);
+
+  EXPECT_TRUE(status.ok()) << status.message();
+
+  // Validate results
+  EXPECT_ARROW_ARRAY_EQUALS(exp_from_hex, outputs.at(0));
+}
+
+TEST_F(TestProjector, TestToHex) {
+  // schema for input fields
+  auto field_a = field("a", arrow::binary());
+  auto field_b = field("b", arrow::int64());
+  auto schema = arrow::schema({field_a, field_b});
+
+  // output fields
+  auto res_1 = field("res1", arrow::utf8());
+  auto res_2 = field("res2", arrow::utf8());
+
+  auto node_a = TreeExprBuilder::MakeField(field_a);
+  auto to_hex = TreeExprBuilder::MakeFunction("to_hex", {node_a}, arrow::utf8());
+  auto expr_1 = TreeExprBuilder::MakeExpression(to_hex, res_1);
+
+  auto node_b = TreeExprBuilder::MakeField(field_b);
+  auto to_hex_numerical =
+      TreeExprBuilder::MakeFunction("to_hex", {node_b}, arrow::utf8());
+  auto expr_2 = TreeExprBuilder::MakeExpression(to_hex_numerical, res_2);
+
+  // Build a projector for the expressions.
+  std::shared_ptr<Projector> projector;
+  auto status =
+      Projector::Make(schema, {expr_1, expr_2}, TestConfiguration(), &projector);
+  EXPECT_TRUE(status.ok()) << status.message();
+
+  // Create a row-batch with some sample data
+  int num_records = 5;
+  auto array_a =
+      MakeArrowArrayBinary({{0x66, 0x6F, 0x6F},
+                            {0x74, 0x65, 0x73, 0x74, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65},
+                            {0x74, 0x65, 0x73, 0x74, 0x20, 0x4E, 0x55, 0x4D, 0x4A, 0x6F},
+                            {0x5B, 0x5D, 0x5B, 0x5B, 0x73, 0x64},
+                            {}},
+                           {true, true, true, true, false});
+
+  auto array_b = MakeArrowArrayInt64({6713199, 499918271520, -1, 1, 52323},
+                                     {true, true, true, true, false});
+
+  // expected output
+  auto exp = MakeArrowArrayUtf8(
+      {"666F6F", "74657374207370616365", "74657374204E554D4A6F", "5B5D5B5B7364", ""},
+      {true, true, true, true, false});
+
+  auto exp_numerical =
+      MakeArrowArrayUtf8({"666F6F", "7465737420", "FFFFFFFFFFFFFFFF", "1", ""},
+                         {true, true, true, true, false});
+
+  // prepare input record batch
+  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a, array_b});
+
+  // Evaluate expression
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in_batch, pool_, &outputs);
+  EXPECT_TRUE(status.ok());
+
+  // Validate results
+  EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0));
+  EXPECT_ARROW_ARRAY_EQUALS(exp_numerical, outputs.at(1));
+}
 }  // namespace gandiva