Skip to content

Commit

Permalink
Add TO_HEX and FROM_HEX function in Gandiva
Browse files Browse the repository at this point in the history
  • Loading branch information
anthonylouisbsb committed Nov 17, 2021
1 parent 34b8604 commit 924aa15
Show file tree
Hide file tree
Showing 5 changed files with 463 additions and 0 deletions.
15 changes: 15 additions & 0 deletions cpp/src/gandiva/function_registry_string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,21 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
kResultNullIfNull, "gdv_fn_castVARBINARY_float64_int64",
NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),

NativeFunction("to_hex", {"hex"}, DataTypeVector{binary()}, utf8(),
kResultNullIfNull, "to_hex_binary", NativeFunction::kNeedsContext),

NativeFunction("to_hex", {"hex"}, DataTypeVector{utf8()}, utf8(), kResultNullIfNull,
"to_hex_binary", NativeFunction::kNeedsContext),

NativeFunction("to_hex", {"hex"}, DataTypeVector{int64()}, utf8(),
kResultNullIfNull, "to_hex_int64", NativeFunction::kNeedsContext),

NativeFunction("to_hex", {"hex"}, DataTypeVector{int32()}, utf8(),
kResultNullIfNull, "to_hex_int32", NativeFunction::kNeedsContext),

NativeFunction("from_hex", {"unhex"}, DataTypeVector{utf8()}, binary(),
kResultNullIfNull, "from_hex_utf8", NativeFunction::kNeedsContext),

NativeFunction("split_part", {}, DataTypeVector{utf8(), utf8(), int32()}, utf8(),
kResultNullIfNull, "split_part",
NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors)};
Expand Down
116 changes: 116 additions & 0 deletions cpp/src/gandiva/precompiled/string_ops.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,13 @@
// under the License.

// String functions
#include "arrow/util/logging.h"
#include "arrow/util/value_parsing.h"

extern "C" {

#include <algorithm>
#include <cinttypes>
#include <climits>
#include <cstdio>
#include <cstdlib>
Expand Down Expand Up @@ -2195,4 +2197,118 @@ const char* byte_substr_binary_int32_int32(gdv_int64 context, const char* text,
memcpy(ret, text + startPos, *out_len);
return ret;
}

// Gets a binary object and returns its hexadecimal representation. That representation
// maps each byte in the input to a 2-length string containing an uppercase hexadecimal
// number.
// - Examples:
//     - foo -> 666F6F = 66[f] 6F[o] 6F[o]
//     - bar -> 626172 = 62[b] 61[a] 72[r]
//
// Parameters:
//   context  - handle forwarded to gdv_fn_context_arena_malloc and
//              gdv_fn_context_set_error_msg
//   text     - input bytes; exactly `text_len` bytes are read
//   text_len - number of input bytes; a non-positive value yields an empty result
//   out_len  - receives the number of characters produced (2 * text_len on success,
//              0 for empty input or on failure)
//
// Returns the encoded characters (a terminating NUL is written after them but is not
// counted in *out_len). On failure an error message is set on the context and ""
// is returned.
FORCE_INLINE
const char* to_hex_binary(int64_t context, const char* text, int32_t text_len,
                          int32_t* out_len) {
  if (text_len <= 0) {
    *out_len = 0;
    return "";
  }

  // The output needs 2 * text_len characters plus a terminator. Reject inputs for
  // which that size cannot be represented in int32_t instead of overflowing.
  if (text_len > (INT32_MAX - 1) / 2) {
    gdv_fn_context_set_error_msg(
        context, "Input is too large to be converted to a hexadecimal string");
    *out_len = 0;
    return "";
  }

  const int32_t hex_len = text_len * 2;
  auto ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, hex_len + 1));

  if (ret == nullptr) {
    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
    *out_len = 0;
    return "";
  }

  const char* hex_digits = "0123456789ABCDEF";

  for (int32_t i = 0; i < text_len; i++) {
    // Go through unsigned char so bytes >= 0x80 are not sign-extended.
    const uint32_t byte = static_cast<unsigned char>(text[i]);

    ret[2 * i] = hex_digits[byte >> 4];
    ret[2 * i + 1] = hex_digits[byte & 0x0F];
  }
  ret[hex_len] = '\0';

  *out_len = hex_len;
  return ret;
}

// Formats a 64-bit integer as uppercase hexadecimal without leading zeros. The value
// is printed as its unsigned 64-bit bit pattern, so -1 becomes FFFFFFFFFFFFFFFF.
//
// Parameters:
//   context - handle forwarded to gdv_fn_context_arena_malloc and
//             gdv_fn_context_set_error_msg
//   data    - value to format
//   out_len - receives the number of hex digits produced (1 to 16), or 0 on failure
//
// Returns the formatted digits, or "" with an error message set on the context if the
// output buffer could not be allocated.
FORCE_INLINE
const char* to_hex_int64(int64_t context, int64_t data, int32_t* out_len) {
  // Up to 16 digits, plus one byte for the NUL terminator that snprintf always
  // appends. The buffer must be as large as the size handed to snprintf.
  const int32_t max_hex_digits = 2 * sizeof(int64_t);
  const int32_t buffer_size = max_hex_digits + 1;

  auto ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, buffer_size));

  if (ret == nullptr) {
    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
    *out_len = 0;
    return "";
  }

  // PRIX64 expects an unsigned 64-bit argument.
  const int written =
      snprintf(ret, buffer_size, "%" PRIX64, static_cast<uint64_t>(data));

  // snprintf returns the number of characters written (excluding the terminator);
  // a negative value signals a formatting error.
  *out_len = written > 0 ? static_cast<int32_t>(written) : 0;
  return ret;
}

// Formats a 32-bit integer as uppercase hexadecimal without leading zeros. The value
// is printed as its unsigned 32-bit bit pattern, so -1 becomes FFFFFFFF.
//
// Parameters:
//   context - handle forwarded to gdv_fn_context_arena_malloc and
//             gdv_fn_context_set_error_msg
//   data    - value to format
//   out_len - receives the number of hex digits produced (1 to 8), or 0 on failure
//
// Returns the formatted digits, or "" with an error message set on the context if the
// output buffer could not be allocated.
FORCE_INLINE
const char* to_hex_int32(int64_t context, int32_t data, int32_t* out_len) {
  // Up to 8 digits, plus one byte for the NUL terminator that snprintf always
  // appends. The buffer must be as large as the size handed to snprintf.
  const int32_t max_hex_digits = 2 * sizeof(int32_t);
  const int32_t buffer_size = max_hex_digits + 1;

  auto ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, buffer_size));

  if (ret == nullptr) {
    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
    *out_len = 0;
    return "";
  }

  // PRIX32 expects an unsigned 32-bit argument.
  const int written =
      snprintf(ret, buffer_size, "%" PRIX32, static_cast<uint32_t>(data));

  // snprintf returns the number of characters written (excluding the terminator);
  // a negative value signals a formatting error.
  *out_len = written > 0 ? static_cast<int32_t>(written) : 0;
  return ret;
}

// Decodes a hexadecimal string into the bytes it represents: every pair of hex digits
// (upper or lower case) in the input becomes one output byte.
//
// Parameters:
//   context  - handle forwarded to gdv_fn_context_arena_malloc and
//              gdv_fn_context_set_error_msg
//   text     - input characters; exactly `text_len` characters are read
//   text_len - number of input characters; must be a multiple of two
//   out_len  - receives the number of decoded bytes (text_len / 2 on success, 0 for
//              empty input or on failure)
//
// Returns the decoded bytes (not NUL-terminated). If the length is odd, a character is
// not a hex digit, or the output buffer cannot be allocated, an error message is set
// on the context and "" is returned.
FORCE_INLINE
const char* from_hex_utf8(int64_t context, const char* text, int32_t text_len,
                          int32_t* out_len) {
  if (text_len == 0) {
    *out_len = 0;
    return "";
  }

  // the input string should have a length multiple of two
  if (text_len % 2 != 0) {
    gdv_fn_context_set_error_msg(
        context, "Error parsing hex string, length was not a multiple of two.");
    *out_len = 0;
    return "";
  }

  char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, text_len / 2));

  if (ret == nullptr) {
    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
    *out_len = 0;
    return "";
  }

  // converting hex encoded string to normal string
  int32_t j = 0;
  for (int32_t i = 0; i < text_len; i += 2) {
    char b1 = text[i];
    char b2 = text[i + 1];
    // isxdigit has undefined behavior for arguments that are not representable as
    // unsigned char, so bytes >= 0x80 must not be passed as (possibly negative) char.
    if (isxdigit(static_cast<unsigned char>(b1)) &&
        isxdigit(static_cast<unsigned char>(b2))) {
      // [a-fA-F0-9]
      ret[j++] = to_binary_from_hex(b1) * 16 + to_binary_from_hex(b2);
    } else {
      gdv_fn_context_set_error_msg(
          context, "Error parsing hex string, one or more bytes are not valid.");
      *out_len = 0;
      return "";
    }
  }
  *out_len = j;
  return ret;
}
} // extern "C"
226 changes: 226 additions & 0 deletions cpp/src/gandiva/precompiled/string_ops_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1755,4 +1755,230 @@ TEST(TestStringOps, TestConvertToBigEndian) {
#endif
}

// Exercises to_hex_binary on empty, single-byte and multi-byte inputs, including
// whitespace/control bytes, and checks both the text and the reported length.
TEST(TestStringOps, TestToHex) {
  gandiva::ExecutionContext ctx;
  uint64_t ctx_ptr = reinterpret_cast<int64_t>(&ctx);
  int32_t out_len = 0;

  // Encodes `num_bytes` bytes and returns the result as a std::string; `out_len`
  // keeps the length reported by the call so it can be asserted separately.
  auto hex_of = [&](const char* bytes, int32_t num_bytes) {
    const char* encoded = to_hex_binary(ctx_ptr, bytes, num_bytes, &out_len);
    return std::string(encoded, out_len);
  };

  // 10 bytes: ASCII "TestString".
  char word[] = {0x54, 0x65, 0x73, 0x74, 0x53, 0x74, 0x72, 0x69, 0x6E, 0x67};
  std::string hex = hex_of(word, 10);
  EXPECT_EQ(out_len, 2 * 10);
  EXPECT_EQ(hex, "54657374537472696E67");

  // Empty input.
  hex = hex_of("", 0);
  EXPECT_EQ(out_len, 0);
  EXPECT_EQ(hex, "");

  // A single byte.
  char single[] = {0x54};
  hex = hex_of(single, 1);
  EXPECT_EQ(out_len, 2 * 1);
  EXPECT_EQ(hex, "54");

  // 16 bytes: ASCII "Test with spaces".
  char spaced[] = {0x54, 0x65, 0x73, 0x74, 0x20, 0x77, 0x69, 0x74,
                   0x68, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x73};
  hex = hex_of(spaced, 16);
  EXPECT_EQ(hex, "54657374207769746820737061636573");

  // 20 bytes containing a line feed (0x0A).
  char multiline[] = {0x54, 0x65, 0x78, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x0A,
                      0x62, 0x72, 0x65, 0x61, 0x6B, 0x20, 0x6C, 0x69, 0x6E, 0x65};
  hex = hex_of(multiline, 20);
  EXPECT_EQ(out_len, 2 * 20);
  EXPECT_EQ(hex, "5465787420776974680A627265616B206C696E65");

  // 27 bytes containing digits and punctuation.
  char numeric[] = {0x54, 0x65, 0x73, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68,
                    0x20, 0x6E, 0x75, 0x6D, 0x62, 0x65, 0x72, 0x73, 0x20,
                    0x31, 0x20, 0x2B, 0x20, 0x31, 0x20, 0x3D, 0x20, 0x32};
  hex = hex_of(numeric, 27);
  EXPECT_EQ(out_len, 2 * 27);
  EXPECT_EQ(hex, "546573742077697468206E756D626572732031202B2031203D2032");

  // 22 bytes mixing tabs (0x09) and line feeds (0x0A) with printable characters.
  char tabbed[] = {0x09, 0x0A, 0x09, 0x0A, 0x09, 0x0A, 0x09, 0x0A,
                   0x0A, 0x0A, 0x09, 0x20, 0x61, 0x20, 0x6C, 0x65,
                   0x74, 0x74, 0x40, 0x5D, 0x65, 0x72};
  hex = hex_of(tabbed, 22);
  EXPECT_EQ(out_len, 2 * 22);
  EXPECT_EQ(hex, "090A090A090A090A0A0A092061206C657474405D6572");

  // The same 22 bytes, supplied as an escaped string literal.
  const char* escaped_literal =
      "\x09\x0A\x09\x0A\x09\x0A\x09\x0A\x0A\x0A\x09\x20\x61\x20\x6C\x65\x74\x74\x40\x5D"
      "\x65\x72";
  hex = hex_of(escaped_literal, 22);
  EXPECT_EQ(out_len, 2 * 22);
  EXPECT_EQ(hex, "090A090A090A090A0A0A092061206C657474405D6572");
}

// Checks to_hex_int64 on boundary values, zero and +/-1: no error may be raised and
// both the digits and the reported length must match.
TEST(TestStringOps, TestToHexInt64) {
  gandiva::ExecutionContext ctx;
  uint64_t ctx_ptr = reinterpret_cast<int64_t>(&ctx);

  struct HexCase {
    int64_t input;
    int32_t expected_len;
    const char* expected_hex;
  };

  const HexCase cases[] = {
      {INT64_MAX, 16, "7FFFFFFFFFFFFFFF"},
      {INT64_MIN, 16, "8000000000000000"},
      {0, 1, "0"},
      {-0, 1, "0"},
      {-1, 16, "FFFFFFFFFFFFFFFF"},
      {1, 1, "1"},
  };

  for (const HexCase& hex_case : cases) {
    int32_t out_len = 0;
    const char* out_str = to_hex_int64(ctx_ptr, hex_case.input, &out_len);
    const std::string output(out_str, out_len);

    EXPECT_FALSE(ctx.has_error());
    EXPECT_EQ(out_len, hex_case.expected_len);
    EXPECT_EQ(output, hex_case.expected_hex);
    ctx.Reset();
  }
}

// Checks to_hex_int32 on boundary values, zero and +/-1: no error may be raised and
// both the digits and the reported length must match.
TEST(TestStringOps, TestToHexInt32) {
  gandiva::ExecutionContext ctx;
  uint64_t ctx_ptr = reinterpret_cast<int64_t>(&ctx);

  struct HexCase {
    int32_t input;
    int32_t expected_len;
    const char* expected_hex;
  };

  const HexCase cases[] = {
      {INT32_MAX, 8, "7FFFFFFF"},
      {INT32_MIN, 8, "80000000"},
      {0, 1, "0"},
      {-0, 1, "0"},
      {-1, 8, "FFFFFFFF"},
      {1, 1, "1"},
  };

  for (const HexCase& hex_case : cases) {
    int32_t out_len = 0;
    const char* out_str = to_hex_int32(ctx_ptr, hex_case.input, &out_len);
    const std::string output(out_str, out_len);

    EXPECT_FALSE(ctx.has_error());
    EXPECT_EQ(out_len, hex_case.expected_len);
    EXPECT_EQ(output, hex_case.expected_hex);
    ctx.Reset();
  }
}

// Checks from_hex_utf8: valid inputs (mixed-case digits, empty string) decode to the
// expected bytes, while odd-length or non-hex inputs yield "" and set an error.
TEST(TestStringOps, TestFromHex) {
  gandiva::ExecutionContext ctx;
  uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
  gdv_int32 out_len = 0;

  // Decodes `len` characters of `hex` and returns the result as a std::string.
  auto decode = [&](const char* hex, gdv_int32 len) {
    const char* decoded = from_hex_utf8(ctx_ptr, hex, len, &out_len);
    return std::string(decoded, out_len);
  };

  // Inputs that must decode successfully.
  struct ValidCase {
    const char* hex;
    gdv_int32 hex_len;
    const char* expected;
  };
  const ValidCase valid_cases[] = {
      {"414243", 6, "ABC"}, {"", 0, ""},      {"41", 2, "A"},
      {"6d6D", 4, "mm"},    {"6f6d", 4, "om"}, {"4f4D", 4, "OM"},
  };
  for (const ValidCase& valid : valid_cases) {
    EXPECT_EQ(decode(valid.hex, valid.hex_len), valid.expected);
  }

  // Inputs that must be rejected with a specific error message.
  struct InvalidCase {
    const char* hex;
    gdv_int32 hex_len;
    const char* error_fragment;
  };
  const InvalidCase invalid_cases[] = {
      {"T", 1, "Error parsing hex string, length was not a multiple of"},
      {"\\x41\\x42\\x43", 12,
       "Error parsing hex string, one or more bytes are not valid."},
  };
  for (const InvalidCase& invalid : invalid_cases) {
    EXPECT_EQ(decode(invalid.hex, invalid.hex_len), "");
    EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr(invalid.error_fragment));
    ctx.Reset();
  }
}
} // namespace gandiva
Loading

0 comments on commit 924aa15

Please sign in to comment.