Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add support for escaping characters in KQL key names. #560

Merged
merged 7 commits into from
Nov 13, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion components/core/src/clp_s/JsonParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,12 @@ JsonParser::JsonParser(JsonParserOption const& option)
}

if (false == m_timestamp_key.empty()) {
clp_s::StringUtils::tokenize_column_descriptor(m_timestamp_key, m_timestamp_column);
if (false
== clp_s::StringUtils::tokenize_column_descriptor(m_timestamp_key, m_timestamp_column))
{
SPDLOG_ERROR("Can not parse invalid timestamp key: \"{}\"", m_timestamp_key);
throw OperationFailed(ErrorCodeBadParam, __FILENAME__, __LINE__);
}
}

for (auto& file_path : option.file_paths) {
Expand Down
4 changes: 3 additions & 1 deletion components/core/src/clp_s/TimestampDictionaryReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,9 @@ void TimestampDictionaryReader::read_new_entries() {
TimestampEntry entry;
std::vector<std::string> tokens;
entry.try_read_from_file(m_dictionary_decompressor);
StringUtils::tokenize_column_descriptor(entry.get_key_name(), tokens);
if (false == StringUtils::tokenize_column_descriptor(entry.get_key_name(), tokens)) {
throw OperationFailed(ErrorCodeCorrupt, __FILENAME__, __LINE__);
}
m_entries.emplace_back(std::move(entry));

// TODO: Currently, we only allow a single authoritative timestamp column at ingestion time,
Expand Down
32 changes: 23 additions & 9 deletions components/core/src/clp_s/Utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -427,18 +427,32 @@ bool StringUtils::convert_string_to_double(std::string const& raw, double& conve
return true;
}

void StringUtils::tokenize_column_descriptor(
bool StringUtils::tokenize_column_descriptor(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you think .a is an illegal case at this step, or leave it to populate_column_mapping?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd agree that it is. Currently the code only complains if the last token is "", but I'll change it to complain if any token is "" and add some tests.

std::string const& descriptor,
std::vector<std::string>& tokens
) {
// TODO: handle escaped . correctly
auto start = 0U;
auto end = descriptor.find('.');
while (end != std::string::npos) {
tokens.push_back(descriptor.substr(start, end - start));
start = end + 1;
end = descriptor.find('.', start);
// TODO: add support for unicoce sequences e.g. \u263A
gibber9809 marked this conversation as resolved.
Show resolved Hide resolved
std::string cur_tok;
size_t cur = 0;
for (size_t cur = 0; cur < descriptor.size(); ++cur) {
gibber9809 marked this conversation as resolved.
Show resolved Hide resolved
if ('\\' == descriptor[cur]) {
++cur;
if (cur >= descriptor.size()) {
return false;
gibber9809 marked this conversation as resolved.
Show resolved Hide resolved
}
gibber9809 marked this conversation as resolved.
Show resolved Hide resolved
} else if ('.' == descriptor[cur]) {
tokens.push_back(cur_tok);
cur_tok.clear();
continue;
}
gibber9809 marked this conversation as resolved.
Show resolved Hide resolved
cur_tok.push_back(descriptor[cur]);
}

if (cur_tok.empty()) {
return false;
gibber9809 marked this conversation as resolved.
Show resolved Hide resolved
}
tokens.push_back(descriptor.substr(start));

tokens.push_back(cur_tok);
return true;
}
} // namespace clp_s
3 changes: 2 additions & 1 deletion components/core/src/clp_s/Utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -212,8 +212,9 @@ class StringUtils {
* @param descriptor
* @param tokens
* @return the list of tokens pushed into the 'tokens' parameter
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we use @return here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Taking that out.

* @return true if the descriptor was tokenized successfully, false otherwise
*/
static void
[[nodiscard]] static bool
tokenize_column_descriptor(std::string const& descriptor, std::vector<std::string>& tokens);

private:
Expand Down
5 changes: 4 additions & 1 deletion components/core/src/clp_s/clp-s.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,10 @@ bool search_archive(
try {
for (auto const& column : command_line_arguments.get_projection_columns()) {
std::vector<std::string> descriptor_tokens;
StringUtils::tokenize_column_descriptor(column, descriptor_tokens);
if (false == StringUtils::tokenize_column_descriptor(column, descriptor_tokens)) {
SPDLOG_ERROR("Can not tokenize invalid column: \"{}\"", column);
return false;
}
projection->add_column(ColumnDescriptor::create(descriptor_tokens));
}
} catch (clp_s::TraceableException& e) {
Expand Down
2 changes: 1 addition & 1 deletion components/core/src/clp_s/search/kql/Kql.g4
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ fragment ESCAPED_SPACE
;

fragment SPECIAL_CHARACTER
: [\\():<>"*?{}]
: [\\():<>"*?{}.]
;


Expand Down
11 changes: 9 additions & 2 deletions components/core/src/clp_s/search/kql/kql.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,10 @@ class ParseTreeVisitor : public KqlBaseVisitor {
std::string column = unquote_string(ctx->LITERAL()->getText());

std::vector<std::string> descriptor_tokens;
StringUtils::tokenize_column_descriptor(column, descriptor_tokens);
if (false == StringUtils::tokenize_column_descriptor(column, descriptor_tokens)) {
SPDLOG_ERROR("Can not tokenize invalid column: \"{}\"", column);
return nullptr;
}

return ColumnDescriptor::create(descriptor_tokens);
}
Expand Down Expand Up @@ -248,6 +251,10 @@ std::shared_ptr<Expression> parse_kql_expression(std::istream& in) {
}

ParseTreeVisitor visitor;
return std::any_cast<std::shared_ptr<Expression>>(visitor.visitStart(tree));
try {
return std::any_cast<std::shared_ptr<Expression>>(visitor.visitStart(tree));
} catch (std::exception& e) {
return {};
}
}
} // namespace clp_s::search::kql
24 changes: 24 additions & 0 deletions components/core/tests/test-kql.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -187,4 +187,28 @@ TEST_CASE("Test parsing KQL", "[KQL]") {
auto failure = parse_kql_expression(incorrect_query);
REQUIRE(nullptr == failure);
}

SECTION("Escape sequences in column name") {
auto query = GENERATE(
"a\\.b.c: *",
"\"a\\.b.c\": *",
"a\\.b: {c: *}",
"\"a\\.b\": {\"c\": *}"
);
stringstream escaped_column_query{query};
auto filter
= std::dynamic_pointer_cast<FilterExpr>(parse_kql_expression(escaped_column_query));
REQUIRE(nullptr != filter);
REQUIRE(nullptr != filter->get_operand());
REQUIRE(nullptr != filter->get_column());
REQUIRE(false == filter->has_only_expression_operands());
REQUIRE(false == filter->is_inverted());
REQUIRE(FilterOperation::EQ == filter->get_operation());
REQUIRE(2 == filter->get_column()->get_descriptor_list().size());
auto it = filter->get_column()->descriptor_begin();
SPDLOG_INFO("{}", it->get_token());
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need to print logs for the test case?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No. Will remove this.

REQUIRE(DescriptorToken{"a.b"} == *it++);
SPDLOG_INFO("{}", it->get_token());
REQUIRE(DescriptorToken{"c"} == *it++);
}
}
Loading