feat: Add support for escaping characters in KQL key names. #560

Merged · 7 commits · Nov 13, 2024
7 changes: 6 additions & 1 deletion components/core/src/clp_s/JsonParser.cpp
@@ -21,7 +21,12 @@ JsonParser::JsonParser(JsonParserOption const& option)
}

if (false == m_timestamp_key.empty()) {
clp_s::StringUtils::tokenize_column_descriptor(m_timestamp_key, m_timestamp_column);
if (false
== clp_s::StringUtils::tokenize_column_descriptor(m_timestamp_key, m_timestamp_column))
{
SPDLOG_ERROR("Can not parse invalid timestamp key: \"{}\"", m_timestamp_key);
throw OperationFailed(ErrorCodeBadParam, __FILENAME__, __LINE__);
}
}

for (auto& file_path : option.file_paths) {
4 changes: 3 additions & 1 deletion components/core/src/clp_s/TimestampDictionaryReader.cpp
@@ -44,7 +44,9 @@ void TimestampDictionaryReader::read_new_entries() {
TimestampEntry entry;
std::vector<std::string> tokens;
entry.try_read_from_file(m_dictionary_decompressor);
StringUtils::tokenize_column_descriptor(entry.get_key_name(), tokens);
if (false == StringUtils::tokenize_column_descriptor(entry.get_key_name(), tokens)) {
throw OperationFailed(ErrorCodeCorrupt, __FILENAME__, __LINE__);
}
m_entries.emplace_back(std::move(entry));

// TODO: Currently, we only allow a single authoritative timestamp column at ingestion time,
34 changes: 25 additions & 9 deletions components/core/src/clp_s/Utils.cpp
@@ -427,18 +427,34 @@ bool StringUtils::convert_string_to_double(std::string const& raw, double& conve
return true;
}

void StringUtils::tokenize_column_descriptor(
bool StringUtils::tokenize_column_descriptor(
Contributor commented:

Do you think .a is an illegal case at this step, or leave it to populate_column_mapping?

Contributor (Author) replied:

I'd agree that it is. Currently the code only complains if the last token is "", but I'll change it to complain if any token is "" and add some tests.

std::string const& descriptor,
std::vector<std::string>& tokens
) {
// TODO: handle escaped . correctly
auto start = 0U;
auto end = descriptor.find('.');
while (end != std::string::npos) {
tokens.push_back(descriptor.substr(start, end - start));
start = end + 1;
end = descriptor.find('.', start);
// TODO: add support for unicode sequences e.g. \u263A
std::string cur_tok;
for (size_t cur = 0; cur < descriptor.size(); ++cur) {
if ('\\' == descriptor[cur]) {
++cur;
if (cur >= descriptor.size()) {
return false;
}
} else if ('.' == descriptor[cur]) {
if (cur_tok.empty()) {
return false;
}
tokens.push_back(cur_tok);
cur_tok.clear();
continue;
}
cur_tok.push_back(descriptor[cur]);
}
tokens.push_back(descriptor.substr(start));

if (cur_tok.empty()) {
return false;
}

tokens.push_back(cur_tok);
return true;
Comment on lines +430 to +458
Contributor commented:

💡 Codebase verification: add comprehensive tests for tokenize_column_descriptor. Ensure that various KQL key patterns, including those with escaped characters like periods, are thoroughly tested, and implement test cases covering different escaped sequences and edge cases in the relevant test suite.

Verification script used:

#!/bin/bash
# Search for test cases covering various KQL key patterns
rg -A 5 "TEST.*tokenize_column_descriptor"
# Search for actual usage patterns
rg -A 3 "tokenize_column_descriptor\(" --type cpp

}
} // namespace clp_s
4 changes: 2 additions & 2 deletions components/core/src/clp_s/Utils.hpp
@@ -211,9 +211,9 @@ class StringUtils {
* Converts a string column descriptor delimited by '.' into a list of tokens
* @param descriptor
* @param tokens
* @return the list of tokens pushed into the 'tokens' parameter
* @return true if the descriptor was tokenized successfully, false otherwise
*/
static void
[[nodiscard]] static bool
tokenize_column_descriptor(std::string const& descriptor, std::vector<std::string>& tokens);

private:
5 changes: 4 additions & 1 deletion components/core/src/clp_s/clp-s.cpp
@@ -191,7 +191,10 @@ bool search_archive(
try {
for (auto const& column : command_line_arguments.get_projection_columns()) {
std::vector<std::string> descriptor_tokens;
StringUtils::tokenize_column_descriptor(column, descriptor_tokens);
if (false == StringUtils::tokenize_column_descriptor(column, descriptor_tokens)) {
SPDLOG_ERROR("Can not tokenize invalid column: \"{}\"", column);
return false;
}
projection->add_column(ColumnDescriptor::create(descriptor_tokens));
}
} catch (clp_s::TraceableException& e) {
2 changes: 1 addition & 1 deletion components/core/src/clp_s/search/kql/Kql.g4
@@ -96,7 +96,7 @@ fragment ESCAPED_SPACE
;

fragment SPECIAL_CHARACTER
: [\\():<>"*?{}]
: [\\():<>"*?{}.]
;


11 changes: 9 additions & 2 deletions components/core/src/clp_s/search/kql/kql.cpp
@@ -112,7 +112,10 @@ class ParseTreeVisitor : public KqlBaseVisitor {
std::string column = unquote_string(ctx->LITERAL()->getText());

std::vector<std::string> descriptor_tokens;
StringUtils::tokenize_column_descriptor(column, descriptor_tokens);
if (false == StringUtils::tokenize_column_descriptor(column, descriptor_tokens)) {
SPDLOG_ERROR("Can not tokenize invalid column: \"{}\"", column);
return nullptr;
}

return ColumnDescriptor::create(descriptor_tokens);
}
@@ -248,6 +251,10 @@ std::shared_ptr<Expression> parse_kql_expression(std::istream& in) {
}

ParseTreeVisitor visitor;
return std::any_cast<std::shared_ptr<Expression>>(visitor.visitStart(tree));
try {
return std::any_cast<std::shared_ptr<Expression>>(visitor.visitStart(tree));
} catch (std::exception& e) {
return {};
}
}
} // namespace clp_s::search::kql
45 changes: 45 additions & 0 deletions components/core/tests/test-kql.cpp
@@ -187,4 +187,49 @@ TEST_CASE("Test parsing KQL", "[KQL]") {
auto failure = parse_kql_expression(incorrect_query);
REQUIRE(nullptr == failure);
}

SECTION("Escape sequences in column name") {
auto query = GENERATE(
"a\\.b.c: *",
"\"a\\.b.c\": *",
"a\\.b: {c: *}",
"\"a\\.b\": {\"c\": *}"
);
stringstream escaped_column_query{query};
auto filter
= std::dynamic_pointer_cast<FilterExpr>(parse_kql_expression(escaped_column_query));
REQUIRE(nullptr != filter);
REQUIRE(nullptr != filter->get_operand());
REQUIRE(nullptr != filter->get_column());
REQUIRE(false == filter->has_only_expression_operands());
REQUIRE(false == filter->is_inverted());
REQUIRE(FilterOperation::EQ == filter->get_operation());
REQUIRE(2 == filter->get_column()->get_descriptor_list().size());
auto it = filter->get_column()->descriptor_begin();
REQUIRE(DescriptorToken{"a.b"} == *it++);
REQUIRE(DescriptorToken{"c"} == *it++);
}

SECTION("Illegal escape sequences in column name") {
auto query = GENERATE(
//"a\\:*", this case is technically legal since ':' gets escaped
"\"a\\\":*",
"a\\ :*",
"\"a\\\" :*",
Contributor commented:

What's the difference between line 218 and 220 (or line 219 and 221)?

Contributor (Author) replied:

Just confirming that the tokenizing done by ANTLR respects the space really. E.g. if the space were (unexpectedly) included in the column token then it would actually be legal, so we're confirming that this is not the case.

"a.:*",
"\"a.\":*",
"a. :*",
"\"a.\" :*"
);
stringstream illegal_escape{query};
auto filter = parse_kql_expression(illegal_escape);
REQUIRE(nullptr == filter);
}

SECTION("Empty token in column name") {
auto query = GENERATE(".a:*", "a.:*", "a..c:*", "a.b.:*");
stringstream empty_token_column{query};
auto filter = parse_kql_expression(empty_token_column);
REQUIRE(nullptr == filter);
}
}