From c294a8b21d90352ca9f23ad3e5670b0d00cbf521 Mon Sep 17 00:00:00 2001 From: philo Date: Mon, 13 Jun 2022 23:04:37 +0800 Subject: [PATCH] Port the code from https://github.com/apache/arrow/pull/10195 --- cpp/src/gandiva/function_registry_string.cc | 15 ++ cpp/src/gandiva/precompiled/string_ops.cc | 126 +++++++++- .../gandiva/precompiled/string_ops_test.cc | 226 ++++++++++++++++++ cpp/src/gandiva/precompiled/types.h | 10 + 4 files changed, 372 insertions(+), 5 deletions(-) diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc index f9cb907f568a1..5282c99634904 100644 --- a/cpp/src/gandiva/function_registry_string.cc +++ b/cpp/src/gandiva/function_registry_string.cc @@ -327,6 +327,21 @@ std::vector GetStringFunctionRegistry() { kResultNullIfNull, "url_decoder", NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors), + NativeFunction("to_hex", {"hex"}, DataTypeVector{binary()}, utf8(), + kResultNullIfNull, "to_hex_binary", NativeFunction::kNeedsContext), + + NativeFunction("to_hex", {"hex"}, DataTypeVector{utf8()}, utf8(), kResultNullIfNull, + "to_hex_binary", NativeFunction::kNeedsContext), + + NativeFunction("to_hex", {"hex"}, DataTypeVector{int64()}, utf8(), + kResultNullIfNull, "to_hex_int64", NativeFunction::kNeedsContext), + + NativeFunction("to_hex", {"hex"}, DataTypeVector{int32()}, utf8(), + kResultNullIfNull, "to_hex_int32", NativeFunction::kNeedsContext), + + NativeFunction("from_hex", {"unhex"}, DataTypeVector{utf8()}, binary(), + kResultNullIfNull, "from_hex_utf8", NativeFunction::kNeedsContext), + NativeFunction("conv", {}, DataTypeVector{utf8(), int32(), int32()}, utf8(), kResultNullInternal, "conv", NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors)}; diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc index 5ae0b178dc424..367ebdb37e556 100644 --- a/cpp/src/gandiva/precompiled/string_ops.cc +++ 
b/cpp/src/gandiva/precompiled/string_ops.cc
@@ -16,14 +16,17 @@
 // under the License.
 
 // String functions
+#include "arrow/util/logging.h"
 #include "arrow/util/value_parsing.h"
 
 extern "C" {
 
-#include
-#include
-#include
-#include
+#include
+#include
+#include
+#include
+#include
+#include
 
 #include "./types.h"
 
@@ -1644,4 +1647,145 @@ const char* conv(gdv_int64 context, const char* input, gdv_int32 input_len, bool
   return out_str;
 }
 
-}  // extern "C"
\ No newline at end of file
+// Gets a binary object and returns its hexadecimal representation. That representation
+// maps each byte in the input to a 2-length string containing a hexadecimal number.
+// - Examples:
+//     - foo -> 666F6F = 66[f] 6F[o] 6F[o]
+//     - bar -> 626172 = 62[b] 61[a] 72[r]
+//
+// The digits are upper-case and the result (2 * text_len bytes, stored in *out_len) is
+// followed by a NUL terminator. Empty input yields an empty string; on failure an error
+// message is set on the context and an empty string with *out_len == 0 is returned.
+FORCE_INLINE
+const char* to_hex_binary(int64_t context, const char* text, int32_t text_len,
+                          int32_t* out_len) {
+  *out_len = 0;
+  if (text_len <= 0) {
+    return "";
+  }
+
+  // The buffer size 2 * text_len + 1 is computed in int32_t arithmetic; reject lengths
+  // for which it would overflow.
+  if (text_len > (INT32_MAX - 1) / 2) {
+    gdv_fn_context_set_error_msg(context, "Input is too long to be converted to hex");
+    return "";
+  }
+
+  char* ret =
+      reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, text_len * 2 + 1));
+  if (ret == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
+    return "";
+  }
+
+  // A digit table replaces the former snprintf call per input byte.
+  const char* hex_digits = "0123456789ABCDEF";
+  int32_t ret_index = 0;
+  for (int32_t i = 0; i < text_len; i++) {
+    // Convert through unsigned char so bytes >= 0x80 are not sign-extended.
+    const unsigned char byte = static_cast<unsigned char>(text[i]);
+    ret[ret_index++] = hex_digits[byte >> 4];
+    ret[ret_index++] = hex_digits[byte & 0x0F];
+  }
+  ret[ret_index] = '\0';
+
+  *out_len = ret_index;
+  return ret;
+}
+
+// Returns the upper-case hexadecimal representation of data, without leading zeros
+// (0 -> "0"). Negative values are rendered as their 64-bit two's complement bit
+// pattern, e.g. -1 -> FFFFFFFFFFFFFFFF. The result is followed by a NUL terminator that
+// is not counted in *out_len. On allocation failure an error message is set on the
+// context and an empty string is returned.
+FORCE_INLINE
+const char* to_hex_int64(int64_t context, int64_t data, int32_t* out_len) {
+  // Up to 16 digits, plus one byte for the terminator. The terminator used to be left
+  // out of the allocation, so every 16-digit value wrote one byte past the buffer.
+  const int32_t max_digits = static_cast<int32_t>(2 * sizeof(int64_t));
+  char* ret =
+      reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, max_digits + 1));
+  if (ret == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
+    *out_len = 0;
+    return "";
+  }
+
+  uint64_t value = static_cast<uint64_t>(data);
+
+  // Count the significant 4-bit groups; zero still needs one digit.
+  int32_t num_digits = 1;
+  for (uint64_t rest = value >> 4; rest != 0; rest >>= 4) {
+    num_digits++;
+  }
+
+  // Fill in the digits starting from the least significant one.
+  const char* hex_digits = "0123456789ABCDEF";
+  for (int32_t pos = num_digits - 1; pos >= 0; pos--) {
+    ret[pos] = hex_digits[value & 0x0F];
+    value >>= 4;
+  }
+  ret[num_digits] = '\0';
+
+  *out_len = num_digits;
+  return ret;
+}
+
+// Returns the upper-case hexadecimal representation of data, without leading zeros.
+// Negative values are rendered as their 32-bit two's complement bit pattern, e.g.
+// -1 -> FFFFFFFF.
+FORCE_INLINE
+const char* to_hex_int32(int64_t context, int32_t data, int32_t* out_len) {
+  // Zero-extend (rather than sign-extend) to 64 bits so that negative inputs produce at
+  // most 8 digits, then reuse to_hex_int64, whose buffer has room for the terminator.
+  // The former 8-byte buffer was overrun by one byte for every 8-digit value.
+  const uint32_t bits = static_cast<uint32_t>(data);
+  return to_hex_int64(context, static_cast<int64_t>(bits), out_len);
+}
+
+// Decodes a hexadecimal string (two digits per byte, either case) into the bytes it
+// represents, e.g. "414243" -> "ABC". If the length is odd or a character is not a
+// hexadecimal digit, an error message is set on the context, *out_len is 0 and an
+// empty string is returned.
+FORCE_INLINE
+const char* from_hex_utf8(int64_t context, const char* text, int32_t text_len,
+                          int32_t* out_len) {
+  *out_len = 0;
+  if (text_len <= 0) {
+    return "";
+  }
+
+  // the input string should have a length multiple of two
+  if (text_len % 2 != 0) {
+    gdv_fn_context_set_error_msg(
+        context, "Error parsing hex string, length was not a multiple of two.");
+    return "";
+  }
+
+  char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, text_len / 2));
+  if (ret == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
+    return "";
+  }
+
+  // converting hex encoded string to normal string
+  int32_t j = 0;
+  for (int32_t i = 0; i < text_len; i += 2) {
+    const char b1 = text[i];
+    const char b2 = text[i + 1];
+    // isxdigit() is undefined for negative arguments other than EOF and plain char may
+    // be signed, so convert to unsigned char before classifying the byte.
+    if (!isxdigit(static_cast<unsigned char>(b1)) ||
+        !isxdigit(static_cast<unsigned char>(b2))) {
+      gdv_fn_context_set_error_msg(
+          context, "Error parsing hex string, one or more bytes are not valid.");
+      return "";
+    }
+    // both characters are in [a-fA-F0-9]
+    ret[j++] = static_cast<char>(to_binary_from_hex(b1) * 16 + to_binary_from_hex(b2));
+  }
+
+  *out_len = j;
+  return ret;
+}
+}  // extern "C"
diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc
index a672c65183541..3724600a520e7 100644
--- a/cpp/src/gandiva/precompiled/string_ops_test.cc
+++ b/cpp/src/gandiva/precompiled/string_ops_test.cc
@@ -1208,4 +1208,230 @@ TEST(TestStringOps, TestConv) {
   EXPECT_EQ(out_valid, false);
 }

+TEST(TestStringOps, TestToHex) {  // to_hex_binary: each byte -> 2 upper-case hex digits
+  gandiva::ExecutionContext ctx;
+  uint64_t ctx_ptr = reinterpret_cast(&ctx);  // NOTE(review): cast target type looks lost in extraction - confirm
+  int32_t out_len = 0;
+  int32_t in_len = 0;
+  const char* out_str;
+
+  in_len = 10;  // "TestString"
+  char in_str[] = {0x54, 0x65, 0x73, 0x74, 0x53, 0x74, 0x72, 0x69, 0x6E, 0x67};
+  out_str = to_hex_binary(ctx_ptr, in_str, in_len, &out_len);
+  std::string output = std::string(out_str, out_len);
+  EXPECT_EQ(out_len, 2 * in_len);
+  EXPECT_EQ(output, "54657374537472696E67");
+
+  in_len = 0;  // empty input -> empty output
+  out_str = to_hex_binary(ctx_ptr, "", in_len, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_EQ(out_len, 0);
+  EXPECT_EQ(output, "");
+
+  in_len = 1;  // single byte 'T'
+  char in_str_one_char[] = {0x54};
+  out_str = to_hex_binary(ctx_ptr, in_str_one_char, in_len, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_EQ(out_len, 2 * in_len);
+  EXPECT_EQ(output, "54");
+
+  in_len = 16;  // "Test with spaces"
+  char in_str_spaces[] = {0x54, 0x65, 0x73, 0x74, 0x20, 0x77, 0x69, 0x74,
+                          0x68, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x73};
+  out_str = to_hex_binary(ctx_ptr, in_str_spaces, in_len, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_EQ(output, "54657374207769746820737061636573");  // out_len is not checked in this case
+
+  in_len = 20;  // "Text with", a line feed (0x0A), then "break line"
+  char in_str_break_line[] = {0x54, 0x65, 0x78, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x0A,
+                              0x62, 0x72, 0x65, 0x61, 0x6B, 0x20, 0x6C, 0x69, 0x6E, 0x65};
+  out_str = to_hex_binary(ctx_ptr, in_str_break_line, in_len, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_EQ(out_len, 2 * in_len);
+  EXPECT_EQ(output, "5465787420776974680A627265616B206C696E65");
+
+  in_len = 27;  // "Test with numbers 1 + 1 = 2"
+  char in_str_with_num[] = {0x54, 0x65, 0x73, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68,
+                            0x20, 0x6E, 0x75, 0x6D, 0x62, 0x65, 0x72, 0x73, 0x20,
+                            0x31, 0x20, 0x2B, 0x20, 0x31, 0x20, 0x3D, 0x20, 0x32};
+  out_str = to_hex_binary(ctx_ptr, in_str_with_num, in_len, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_EQ(out_len, 2 * in_len);
+  EXPECT_EQ(output, "546573742077697468206E756D626572732031202B2031203D2032");
+
+  in_len = 22;  // tabs (0x09), line feeds (0x0A) and punctuation
+  char in_str_with_tabs[] = {0x09, 0x0A, 0x09, 0x0A, 0x09, 0x0A, 0x09, 0x0A,
+                             0x0A, 0x0A, 0x09, 0x20, 0x61, 0x20, 0x6C, 0x65,
+                             0x74, 0x74, 0x40, 0x5D, 0x65, 0x72};
+  out_str = to_hex_binary(ctx_ptr, in_str_with_tabs, in_len, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_EQ(out_len, 2 * in_len);
+  EXPECT_EQ(output, "090A090A090A090A0A0A092061206C657474405D6572");
+
+  in_len = 22;  // same bytes as the previous case, passed as a string literal
+  const char* binary_string =
+      "\x09\x0A\x09\x0A\x09\x0A\x09\x0A\x0A\x0A\x09\x20\x61\x20\x6C\x65\x74\x74\x40\x5D"
+      "\x65\x72";
+  out_str = to_hex_binary(ctx_ptr, binary_string, in_len, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_EQ(out_len, 2 * in_len);
+  EXPECT_EQ(output, "090A090A090A090A0A0A092061206C657474405D6572");
+}
+
+TEST(TestStringOps, TestToHexInt64) {  // to_hex_int64: upper-case hex, no leading zeros
+  gandiva::ExecutionContext ctx;
+  uint64_t ctx_ptr = reinterpret_cast(&ctx);
+  int32_t out_len = 0;
+  const char* out_str;
+
+  int64_t max_data = INT64_MAX;  // largest value: 16 digits
+  out_str = to_hex_int64(ctx_ptr, max_data, &out_len);
+  std::string output = std::string(out_str, out_len);
+  EXPECT_FALSE(ctx.has_error());
+  EXPECT_EQ(out_len, 16);
+  EXPECT_EQ(output, "7FFFFFFFFFFFFFFF");
+  ctx.Reset();
+
+  int64_t min_data = INT64_MIN;  // negative values are expected as their 64-bit pattern
+  out_str = to_hex_int64(ctx_ptr, min_data, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_FALSE(ctx.has_error());
+  EXPECT_EQ(out_len, 16);
+  EXPECT_EQ(output, "8000000000000000");
+  ctx.Reset();
+
+  int64_t zero_data = 0;  // zero is a single digit
+  out_str = to_hex_int64(ctx_ptr, zero_data, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_FALSE(ctx.has_error());
+  EXPECT_EQ(out_len, 1);
+  EXPECT_EQ(output, "0");
+  ctx.Reset();
+
+  int64_t minus_zero_data = -0;  // -0 is the same integer as 0
+  out_str = to_hex_int64(ctx_ptr, minus_zero_data, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_FALSE(ctx.has_error());
+  EXPECT_EQ(out_len, 1);
+  EXPECT_EQ(output, "0");
+  ctx.Reset();
+
+  int64_t minus_one_data = -1;  // all 64 bits set
+  out_str = to_hex_int64(ctx_ptr, minus_one_data, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_FALSE(ctx.has_error());
+  EXPECT_EQ(out_len, 16);
+  EXPECT_EQ(output, "FFFFFFFFFFFFFFFF");
+  ctx.Reset();
+
+  int64_t one_data = 1;
+  out_str = to_hex_int64(ctx_ptr, one_data, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_FALSE(ctx.has_error());
+  EXPECT_EQ(out_len, 1);
+  EXPECT_EQ(output, "1");
+  ctx.Reset();
+}
+
+TEST(TestStringOps, TestToHexInt32) {  // to_hex_int32: same cases at 32-bit width
+  gandiva::ExecutionContext ctx;
+  uint64_t ctx_ptr = reinterpret_cast(&ctx);
+  int32_t out_len = 0;
+  const char* out_str;
+
+  int32_t max_data = INT32_MAX;  // largest value: 8 digits
+  out_str = to_hex_int32(ctx_ptr, max_data, &out_len);
+  std::string output = std::string(out_str, out_len);
+  EXPECT_FALSE(ctx.has_error());
+  EXPECT_EQ(out_len, 8);
+  EXPECT_EQ(output, "7FFFFFFF");
+  ctx.Reset();
+
+  int32_t min_data = INT32_MIN;  // negative values are expected as their 32-bit pattern
+  out_str = to_hex_int32(ctx_ptr, min_data, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_FALSE(ctx.has_error());
+  EXPECT_EQ(out_len, 8);
+  EXPECT_EQ(output, "80000000");
+  ctx.Reset();
+
+  int32_t zero_data = 0;  // zero is a single digit
+  out_str = to_hex_int32(ctx_ptr, zero_data, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_FALSE(ctx.has_error());
+  EXPECT_EQ(out_len, 1);
+  EXPECT_EQ(output, "0");
+  ctx.Reset();
+
+  int32_t minus_zero_data = -0;  // -0 is the same integer as 0
+  out_str = to_hex_int32(ctx_ptr, minus_zero_data, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_FALSE(ctx.has_error());
+  EXPECT_EQ(out_len, 1);
+  EXPECT_EQ(output, "0");
+  ctx.Reset();
+
+  int32_t minus_one_data = -1;  // all 32 bits set
+  out_str = to_hex_int32(ctx_ptr, minus_one_data, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_FALSE(ctx.has_error());
+  EXPECT_EQ(out_len, 8);
+  EXPECT_EQ(output, "FFFFFFFF");
+  ctx.Reset();
+
+  int32_t one_data = 1;
+  out_str = to_hex_int32(ctx_ptr, one_data, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_FALSE(ctx.has_error());
+  EXPECT_EQ(out_len, 1);
+  EXPECT_EQ(output, "1");
+  ctx.Reset();
+}
+
+TEST(TestStringOps, TestFromHex) {  // from_hex_utf8: hex text -> raw bytes
+  gandiva::ExecutionContext ctx;
+  uint64_t ctx_ptr = reinterpret_cast(&ctx);
+  gdv_int32 out_len = 0;
+  const char* out_str;
+
+  out_str = from_hex_utf8(ctx_ptr, "414243", 6, &out_len);
+  std::string output = std::string(out_str, out_len);
+  EXPECT_EQ(output, "ABC");
+
+  out_str = from_hex_utf8(ctx_ptr, "", 0, &out_len);  // empty input -> empty output
+  output = std::string(out_str, out_len);
+  EXPECT_EQ(output, "");
+
+  out_str = from_hex_utf8(ctx_ptr, "41", 2, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_EQ(output, "A");
+
+  out_str = from_hex_utf8(ctx_ptr, "6d6D", 4, &out_len);  // lower- and upper-case digits
+  output = std::string(out_str, out_len);
+  EXPECT_EQ(output, "mm");
+
+  out_str = from_hex_utf8(ctx_ptr, "6f6d", 4, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_EQ(output, "om");
+
+  out_str = from_hex_utf8(ctx_ptr, "4f4D", 4, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_EQ(output, "OM");
+
+  out_str = from_hex_utf8(ctx_ptr, "T", 1, &out_len);  // odd length is reported as an error
+  output = std::string(out_str, out_len);
+  EXPECT_EQ(output, "");
+  EXPECT_THAT(
+      ctx.get_error(),
+      ::testing::HasSubstr("Error parsing hex string, length was not a multiple of"));
+  ctx.Reset();
+
+  out_str = from_hex_utf8(ctx_ptr, "\\x41\\x42\\x43", 12, &out_len);  // 12 chars of backslash-x text, not hex digits
+  output = std::string(out_str, out_len);
+  EXPECT_EQ(output, "");
+  EXPECT_THAT(
+      ctx.get_error(),
+      ::testing::HasSubstr("Error parsing hex string, one or more bytes are not valid."));
+  ctx.Reset();
+}
 }  // namespace gandiva
diff --git a/cpp/src/gandiva/precompiled/types.h b/cpp/src/gandiva/precompiled/types.h
index e69269cca97ac..579970ea4be6f 100644
--- a/cpp/src/gandiva/precompiled/types.h
+++ b/cpp/src/gandiva/precompiled/types.h
@@ -507,6 +507,16 @@ const char* castVARCHAR_float64_int64(int64_t context, double value, int64_t len
 const char* binary_string(gdv_int64 context, const char* text, gdv_int32 text_len,
                           gdv_int32* out_len);
 
+const char* to_hex_binary(int64_t context, const char* text, int32_t text_len,
+                          int32_t* out_len);  // bytes -> hex text
+
+const char* to_hex_int64(int64_t context, int64_t data, int32_t* out_len);  // int64 -> hex text
+
+const char* to_hex_int32(int64_t context, int32_t data, int32_t* out_len);  // int32 -> hex text
+
+const char* from_hex_utf8(int64_t context, const char* text, int32_t text_len,
+                          int32_t* out_len);  // hex text -> bytes
+
 int32_t castINT_utf8(int64_t context, const char* data, int32_t len);
 
 int64_t castBIGINT_utf8(int64_t context, const char* data, int32_t len);