Skip to content

Commit

Permalink
Use a custom float encoding to reduce the number of log types and dictionary variables (#38)
Browse files Browse the repository at this point in the history

The custom encoding:
* encodes the float's displayed precision, eliminating the precision from the log type, which in turn eliminates redundant log types where the only difference was the float's precision.
* encodes an additional decimal digit precisely, reducing the number of float variables that would end up in the variable dictionary.
* is faster than the previous IEEE-754 encoding and decoding process.
  • Loading branch information
kirkrodrigues authored Nov 19, 2021
1 parent 20af5f2 commit 6e71307
Show file tree
Hide file tree
Showing 8 changed files with 206 additions and 235 deletions.
192 changes: 121 additions & 71 deletions components/core/src/EncodedVariableInterpreter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,6 @@ using std::string;
using std::unordered_set;
using std::vector;

// Constants
// 1 sign + LogTypeDictionaryEntry::cMaxDigitsInRepresentableDoubleVar + 1 decimal point
static const size_t cMaxCharsInRepresentableDoubleVar = LogTypeDictionaryEntry::cMaxDigitsInRepresentableDoubleVar + 2;

encoded_variable_t EncodedVariableInterpreter::get_var_dict_id_range_begin () {
return m_var_dict_id_range_begin;
}
Expand Down Expand Up @@ -75,66 +71,135 @@ bool EncodedVariableInterpreter::convert_string_to_representable_integer_var (co
return true;
}

bool EncodedVariableInterpreter::convert_string_to_representable_double_var (const string& value, uint8_t& num_integer_digits, uint8_t& num_fractional_digits,
encoded_variable_t& encoded_var)
{
size_t length = value.length();
bool EncodedVariableInterpreter::convert_string_to_representable_double_var (const string& value, encoded_variable_t& encoded_var) {
if (value.empty()) {
// Can't convert an empty string
return false;
}

// Check for preceding negative sign
size_t first_digit_pos = 0;
if (first_digit_pos < length && '-' == value[first_digit_pos]) {
++first_digit_pos;
size_t pos = 0;
constexpr size_t cMaxDigitsInRepresentableDoubleVar = 16;
// +1 for decimal point
size_t max_length = cMaxDigitsInRepresentableDoubleVar + 1;

// Check for a negative sign
bool is_negative = false;
if ('-' == value[pos]) {
is_negative = true;
++pos;
// Include sign in max length
++max_length;
}

if (length > cMaxCharsInRepresentableDoubleVar) {
// Too many characters besides sign to represent precisely
return false;
}
} else {
// No negative sign, so check against max size - 1
if (length > cMaxCharsInRepresentableDoubleVar - 1) {
// Too many characters to represent precisely
return false;
}
// Check if value can be represented in encoded format
if (value.length() > max_length) {
return false;
}

// Find decimal point
size_t num_digits = 0;
size_t decimal_point_pos = string::npos;
for (size_t i = first_digit_pos; i < length; ++i) {
char c = value[i];
if ('.' == c) {
decimal_point_pos = i;
break;
} else if (!('0' <= c && c <= '9')) {
// Unrepresentable double character
uint64_t digits = 0;
for (; pos < value.length(); ++pos) {
auto c = value[pos];
if ('0' <= c && c <= '9') {
digits *= 10;
digits += (c - '0');
++num_digits;
} else if (string::npos == decimal_point_pos && '.' == c) {
decimal_point_pos = value.length() - 1 - pos;
} else {
// Invalid character
return false;
}
}
if (string::npos == decimal_point_pos) {
// Decimal point doesn't exist
if (string::npos == decimal_point_pos || 0 == decimal_point_pos || 0 == num_digits) {
// No decimal point found, decimal point is after all digits, or no digits found
return false;
}

num_integer_digits = decimal_point_pos - first_digit_pos;
// Encode into 64 bits with the following format (from MSB to LSB):
// - 1 bit : is negative
// - 4 bits: # of decimal digits minus 1
// - This format can represent doubles with between 1 and 16 decimal digits, so we use 4 bits and map the range [1, 16] to [0x0, 0xF]
// - 4 bits: position of the decimal from the right minus 1
// - To see why the position is taken from the right, consider (1) "-123456789012345.6", (2) "-.1234567890123456", and (3) ".1234567890123456"
// - For (1), the decimal point is at index 16 from the left and index 1 from the right.
// - For (2), the decimal point is at index 1 from the left and index 16 from the right.
// - For (3), the decimal point is at index 0 from the left and index 16 from the right.
// - So if we take the decimal position from the left, it can range from 0 to 16 because of the negative sign. Whereas from the right, the
// negative sign is inconsequential.
// - Thus, we use 4 bits and map the range [1, 16] to [0x0, 0xF].
// - 1 bit : unused
// - 54 bits: The digits of the double without the decimal, as an integer
uint64_t encoded_double = 0;
if (is_negative) {
encoded_double = 1;
}
encoded_double <<= 4;
encoded_double |= (num_digits - 1) & 0x0F;
encoded_double <<= 4;
encoded_double |= (decimal_point_pos - 1) & 0x0F;
encoded_double <<= 55;
encoded_double |= digits & 0x003FFFFFFFFFFFFF;
static_assert(sizeof(encoded_var) == sizeof(encoded_double), "sizeof(encoded_var) != sizeof(encoded_double)");
// NOTE: We use memcpy rather than reinterpret_cast to avoid violating strict aliasing; a smart compiler should optimize it to a register move
std::memcpy(&encoded_var, &encoded_double, sizeof(encoded_double));

// Check that remainder of string is purely numbers
for (size_t i = decimal_point_pos + 1; i < length; ++i) {
char c = value[i];
if (!('0' <= c && c <= '9')) {
return false;
}
return true;
}

void EncodedVariableInterpreter::convert_encoded_double_to_string (encoded_variable_t encoded_var, string& value) {
uint64_t encoded_double;
static_assert(sizeof(encoded_double) == sizeof(encoded_var), "sizeof(encoded_double) != sizeof(encoded_var)");
// NOTE: We use memcpy rather than reinterpret_cast to avoid violating strict aliasing; a smart compiler should optimize it to a register move
std::memcpy(&encoded_double, &encoded_var, sizeof(encoded_var));

// Decode according to the format described in EncodedVariableInterpreter::convert_string_to_representable_double_var
uint64_t digits = encoded_double & 0x003FFFFFFFFFFFFF;
encoded_double >>= 55;
uint8_t decimal_pos = (encoded_double & 0x0F) + 1;
encoded_double >>= 4;
uint8_t num_digits = (encoded_double & 0x0F) + 1;
encoded_double >>= 4;
bool is_negative = encoded_double > 0;

size_t value_length = num_digits + 1 + is_negative;
value.resize(value_length);
size_t num_chars_to_process = value_length;

// Add sign
if (is_negative) {
value[0] = '-';
--num_chars_to_process;
}

// Decode until the decimal or the non-zero digits are exhausted
size_t pos = value_length - 1;
for (; pos > (value_length - 1 - decimal_pos) && digits > 0; --pos) {
value[pos] = (char)('0' + (digits % 10));
digits /= 10;
--num_chars_to_process;
}

num_fractional_digits = length - (decimal_point_pos + 1);
if (digits > 0) {
// Skip decimal since it's added at the end
--pos;
--num_chars_to_process;

double result;
if (false == convert_string_to_double(value, result)) {
// Conversion failed
return false;
} else {
encoded_var = *reinterpret_cast<encoded_variable_t*>(&result);
while (digits > 0) {
value[pos--] = (char)('0' + (digits % 10));
digits /= 10;
--num_chars_to_process;
}
}

return true;
// Add remaining zeros
for (; num_chars_to_process > 0; --num_chars_to_process) {
value[pos--] = '0';
}

// Add decimal
value[value_length - 1 - decimal_pos] = '.';
}

void EncodedVariableInterpreter::encode_and_add_to_dictionary (const string& message, LogTypeDictionaryEntry& logtype_dict_entry,
Expand All @@ -151,12 +216,10 @@ void EncodedVariableInterpreter::encode_and_add_to_dictionary (const string& mes
while (logtype_dict_entry.parse_next_var(message, tok_begin_pos, next_delim_pos, last_var_end_pos, var_str)) {
// Encode variable
encoded_variable_t encoded_var;
uint8_t num_integer_digits;
uint8_t num_fractional_digits;
if (convert_string_to_representable_integer_var(var_str, encoded_var)) {
logtype_dict_entry.add_non_double_var();
} else if (convert_string_to_representable_double_var(var_str, num_integer_digits, num_fractional_digits, encoded_var)) {
logtype_dict_entry.add_double_var(num_integer_digits, num_fractional_digits);
} else if (convert_string_to_representable_double_var(var_str, encoded_var)) {
logtype_dict_entry.add_double_var();
} else {
// Variable string looks like a dictionary variable, so encode it as so
variable_dictionary_id_t id;
Expand Down Expand Up @@ -184,12 +247,10 @@ bool EncodedVariableInterpreter::decode_variables_into_message (const LogTypeDic
}

LogTypeDictionaryEntry::VarDelim var_delim;
uint8_t num_integer_digits;
uint8_t num_fractional_digits;
size_t constant_begin_pos = 0;
char double_str[cMaxCharsInRepresentableDoubleVar + 1];
string double_str;
for (size_t i = 0; i < num_vars_in_logtype; ++i) {
size_t var_position = logtype_dict_entry.get_var_info(i, var_delim, num_integer_digits, num_fractional_digits);
size_t var_position = logtype_dict_entry.get_var_info(i, var_delim);

// Add the constant that's between the last variable and this one
decompressed_msg.append(logtype_value, constant_begin_pos, var_position - constant_begin_pos);
Expand All @@ -201,22 +262,13 @@ bool EncodedVariableInterpreter::decode_variables_into_message (const LogTypeDic
auto var_dict_id = decode_var_dict_id(encoded_vars[i]);
decompressed_msg += var_dict.get_value(var_dict_id);
}

// Move past the variable delimiter
constant_begin_pos = var_position + 1;
} else { // LogTypeDictionaryEntry::VarDelim::Double == var_delim
double var_as_double = *reinterpret_cast<const double*>(&encoded_vars[i]);
int double_str_length = num_integer_digits + 1 + num_fractional_digits;
if (std::signbit(var_as_double)) {
++double_str_length;
}
snprintf(double_str, sizeof(double_str), "%0*.*f", double_str_length, num_fractional_digits, var_as_double);
convert_encoded_double_to_string(encoded_vars[i], double_str);

decompressed_msg += double_str;

// Move past the variable delimiter and the double's precision
constant_begin_pos = var_position + 2;
}
// Move past the variable delimiter
constant_begin_pos = var_position + 1;
}
// Append remainder of logtype, if any
if (constant_begin_pos < logtype_value.length()) {
Expand All @@ -234,14 +286,12 @@ bool EncodedVariableInterpreter::encode_and_search_dictionary (const string& var
throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__);
}

uint8_t num_integer_digits;
uint8_t num_fractional_digits;
encoded_variable_t encoded_var;
if (convert_string_to_representable_integer_var(var_str, encoded_var)) {
LogTypeDictionaryEntry::add_non_double_var(logtype);
sub_query.add_non_dict_var(encoded_var);
} else if (convert_string_to_representable_double_var(var_str, num_integer_digits, num_fractional_digits, encoded_var)) {
LogTypeDictionaryEntry::add_double_var(num_integer_digits, num_fractional_digits, logtype);
} else if (convert_string_to_representable_double_var(var_str, encoded_var)) {
LogTypeDictionaryEntry::add_double_var(logtype);
sub_query.add_non_dict_var(encoded_var);
} else {
auto entry = var_dict.get_entry_matching_value(var_str, ignore_case);
Expand Down
20 changes: 14 additions & 6 deletions components/core/src/EncodedVariableInterpreter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
* Class to parse and encode strings into encoded variables and to interpret encoded variables back into strings. An encoded variable is one of:
* i) a variable dictionary ID, referring to an entry in the variable dictionary, or
* ii) a value, representing an integer variable exactly as it appears in the original log message, or
* iii) a value, representing a base-10, 15-digit IEEE-754 encoded double variable, where the number of integer and fractional digits is specified in the
* logtype.
* iii) a value, representing a base-10 number with a decimal point and at most 16 digits, where at least one digit is after the decimal point, encoded with a
* custom format.
*
* To decode an encoded variable, the logtype specifies whether the variable is either:
* - i/ii, or
Expand Down Expand Up @@ -53,13 +53,21 @@ class EncodedVariableInterpreter {
static bool convert_string_to_representable_integer_var (const std::string& value, encoded_variable_t& encoded_var);
/**
* Converts the given string into a representable double variable if possible
* A representable double:
* - is base-10
* - has between 1 and 16 digits and a decimal point, where at least one digit is after the decimal point
* - has an optional negative sign
* @param value
* @param num_integer_digits Number of digits before the decimal point
* @param num_fractional_digits Number of digits after the decimal point
* @param encoded_var
* @return true if was successfully converted, false otherwise
*/
static bool convert_string_to_representable_double_var (const std::string& value, uint8_t& num_integer_digits, uint8_t& num_fractional_digits,
encoded_variable_t& encoded_var);
static bool convert_string_to_representable_double_var (const std::string& value, encoded_variable_t& encoded_var);
/**
* Converts the given encoded double into a string
* @param encoded_var
* @param value
*/
static void convert_encoded_double_to_string (encoded_variable_t encoded_var, std::string& value);

/**
* Parses all variables from a message (while constructing the logtype) and encodes them (adding them to the variable dictionary if necessary)
Expand Down
15 changes: 4 additions & 11 deletions components/core/src/Grep.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -121,17 +121,10 @@ QueryToken::QueryToken (const string& query_string, const size_t begin_pos, cons

encoded_variable_t encoded_var;
bool converts_to_non_dict_var = false;
if (EncodedVariableInterpreter::convert_string_to_representable_integer_var(value_without_wildcards, encoded_var)) {
if (EncodedVariableInterpreter::convert_string_to_representable_integer_var(value_without_wildcards, encoded_var) ||
EncodedVariableInterpreter::convert_string_to_representable_double_var(value_without_wildcards, encoded_var))
{
converts_to_non_dict_var = true;
} else {
// Doesn't convert to an integer variable, so might be a double or dictionary variable
uint8_t num_integer_digits;
uint8_t num_fractional_digits;
if (EncodedVariableInterpreter::convert_string_to_representable_double_var(value_without_wildcards, num_integer_digits, num_fractional_digits,
encoded_var))
{
converts_to_non_dict_var = true;
}
}

if (!converts_to_non_dict_var) {
Expand Down Expand Up @@ -271,7 +264,7 @@ static bool process_var_token (const QueryToken& query_token, const Archive& arc
}

if (query_token.is_double_var()) {
LogTypeDictionaryEntry::add_wildcard_double_var(logtype);
LogTypeDictionaryEntry::add_double_var(logtype);
} else {
LogTypeDictionaryEntry::add_non_double_var(logtype);

Expand Down
Loading

0 comments on commit 6e71307

Please sign in to comment.