[Obs AI Assistant] Remove ES|QL escaping for index names (elastic#183028)

There were some last-minute changes to ES|QL that will ship in 8.14 that we need to take into account: - [BUCKET is now an aggregation function](elastic/elasticsearch#107272) - [index names can no longer be escaped with backticks](elastic/elasticsearch#108431) I'm also including a change that translates `=` to `==` in WHERE commands, and more useful error messages (map a syntax error to the command where it occurred). As BUCKET is often used for timeseries data and we replaced single and double quotes around index names with backticks, these changes introduce a high chance of generating syntactically invalid queries. This PR updates the docs and examples and removes the correction that replaced `"` and `'` around index names with backticks.
kibanamachine · May 9, 2024 · 6004cad · 6004cad
1 parent 24a34a4
commit 6004cad
Show file tree

Hide file tree

Showing 63 changed files with 468 additions and 380 deletions.
diff --git a/...bservability_ai_assistant_app/server/functions/query/correct_common_esql_mistakes.test.ts b/...bservability_ai_assistant_app/server/functions/query/correct_common_esql_mistakes.test.ts
@@ -25,6 +25,9 @@ describe('correctCommonEsqlMistakes', () => {
 
   it('replaces aliasing via the AS keyword with the = operator', () => {
     expectQuery(`FROM logs-* | STATS COUNT() AS count`, 'FROM logs-*\n| STATS count = COUNT()');
+
+    expectQuery(`FROM logs-* | STATS COUNT() as count`, 'FROM logs-*\n| STATS count = COUNT()');
+
     expectQuery(
       `FROM logs-* | STATS AVG(transaction.duration.histogram) AS avg_request_latency, PERCENTILE(transaction.duration.histogram, 95) AS p95`,
       `FROM logs-*
@@ -42,11 +45,33 @@ describe('correctCommonEsqlMistakes', () => {
   });
 
   it(`replaces " or ' escaping in FROM statements with backticks`, () => {
-    expectQuery(`FROM "logs-*" | LIMIT 10`, 'FROM `logs-*`\n| LIMIT 10');
-    expectQuery(`FROM 'logs-*' | LIMIT 10`, 'FROM `logs-*`\n| LIMIT 10');
+    expectQuery(`FROM "logs-*" | LIMIT 10`, 'FROM logs-*\n| LIMIT 10');
+    expectQuery(`FROM 'logs-*' | LIMIT 10`, 'FROM logs-*\n| LIMIT 10');
     expectQuery(`FROM logs-* | LIMIT 10`, 'FROM logs-*\n| LIMIT 10');
   });
 
+  it('replaces = as equal operator with ==', () => {
+    expectQuery(
+      `FROM logs-*\n| WHERE service.name = "foo"`,
+      `FROM logs-*\n| WHERE service.name == "foo"`
+    );
+
+    expectQuery(
+      `FROM logs-*\n| WHERE service.name = "foo" AND service.environment = "bar"`,
+      `FROM logs-*\n| WHERE service.name == "foo" AND service.environment == "bar"`
+    );
+
+    expectQuery(
+      `FROM logs-*\n| WHERE (service.name = "foo" AND service.environment = "bar") OR agent.name = "baz"`,
+      `FROM logs-*\n| WHERE (service.name == "foo" AND service.environment == "bar") OR agent.name == "baz"`
+    );
+
+    expectQuery(
+      `FROM logs-*\n| WHERE \`what=ever\` = "foo=bar"`,
+      `FROM logs-*\n| WHERE \`what=ever\` == "foo=bar"`
+    );
+  });
+
   it('replaces single-quote escaped strings with double-quote escaped strings', () => {
     expectQuery(
       `FROM nyc_taxis
@@ -102,7 +127,7 @@ describe('correctCommonEsqlMistakes', () => {
       | EVAL "@timestamp" = TO_DATETIME(timestamp)
       | WHERE statement LIKE 'SELECT%'
       | STATS avg_duration = AVG(duration)`,
-      `FROM \`postgres-logs*\`
+      `FROM postgres-logs*
     | GROK message "%{TIMESTAMP_ISO8601:timestamp} %{TZ} \[%{NUMBER:process_id}\]: \[%{NUMBER:log_line}\] user=%{USER:user},db=%{USER:database},app=\[%{DATA:application}\],client=%{IP:client_ip} LOG:  duration: %{NUMBER:duration:float} ms  statement: %{GREEDYDATA:statement}"
     | EVAL @timestamp = TO_DATETIME(timestamp)
     | WHERE statement LIKE "SELECT%"

diff --git a/...ion/observability_ai_assistant_app/server/functions/query/correct_common_esql_mistakes.ts b/...ion/observability_ai_assistant_app/server/functions/query/correct_common_esql_mistakes.ts
@@ -62,7 +62,7 @@ function split(value: string, splitToken: string) {
   return statements;
 }
 
-function splitIntoCommands(query: string) {
+export function splitIntoCommands(query: string) {
   const commands: string[] = split(query, '|');
 
   return commands.map((command) => {
@@ -93,8 +93,8 @@ function removeColumnQuotesAndEscape(column: string) {
 function replaceAsKeywordWithAssignments(command: string) {
   return command.replaceAll(/^STATS\s*(.*)/g, (__, statsOperations: string) => {
     return `STATS ${statsOperations.replaceAll(
-      /(,\s*)?(.*?)\sAS\s([`a-zA-Z0-9.\-_]+)/g,
-      '$1$3 = $2'
+      /(,\s*)?(.*?)\s(AS|as)\s([`a-zA-Z0-9.\-_]+)/g,
+      '$1$4 = $2'
     )}`;
   });
 }
@@ -196,6 +196,30 @@ function escapeExpressionsInSort(sortCommand: string) {
   return `SORT ${columnsInSort.join(', ')}`;
 }
 
+function ensureEqualityOperators(whereCommand: string) {
+  const body = whereCommand.split(/^WHERE /)[1];
+
+  const byChar = body.split('');
+
+  let next = '';
+  let isColumnName = false;
+  byChar.forEach((char, index) => {
+    next += char;
+
+    if (!isColumnName && char === '=' && byChar[index - 1] === ' ' && byChar[index + 1] === ' ') {
+      next += '=';
+    }
+
+    if (!isColumnName && (char === '`' || char.match(/[a-z@]/i))) {
+      isColumnName = true;
+    } else if (isColumnName && (char === '`' || !char.match(/[a-z@0-9]/i))) {
+      isColumnName = false;
+    }
+  });
+
+  return `WHERE ${next}`;
+}
+
 export function correctCommonEsqlMistakes(content: string, log: Logger) {
   return content.replaceAll(/```esql\n(.*?)\n```/gms, (_, query: string) => {
     const commands = splitIntoCommands(query.trim());
@@ -206,12 +230,14 @@ export function correctCommonEsqlMistakes(content: string, log: Logger) {
       switch (name) {
         case 'FROM':
           formattedCommand = formattedCommand
-            .replaceAll(/FROM "(.*)"/g, 'FROM `$1`')
-            .replaceAll(/FROM '(.*)'/g, 'FROM `$1`');
+            .replaceAll(/FROM "(.*)"/g, 'FROM $1')
+            .replaceAll(/FROM '(.*)'/g, 'FROM $1')
+            .replaceAll(/FROM `(.*)`/g, 'FROM $1');
           break;
 
         case 'WHERE':
           formattedCommand = replaceSingleQuotesWithDoubleQuotes(formattedCommand);
+          formattedCommand = ensureEqualityOperators(formattedCommand);
           break;
 
         case 'EVAL':

diff --git a/..._solution/observability_ai_assistant_app/server/functions/query/esql_docs/esql-bucket.txt b/..._solution/observability_ai_assistant_app/server/functions/query/esql_docs/esql-bucket.txt
@@ -1,7 +1,6 @@
 BUCKET
 
 Syntax
-BUCKET(expression, buckets, from, to)
 Parameters
 field
 Numeric or date expression from which to derive buckets.
@@ -11,73 +10,94 @@ from
 Start of the range. Can be a number or a date expressed as a string.
 to
 End of the range. Can be a number or a date expressed as a string.
-DescriptionCreates human-friendly buckets and returns a value for each row that corresponds
-to the resulting bucket the row falls into.Using a target number of buckets, a start of a range, and an end of a range,
-BUCKET picks an appropriate bucket size to generate the target number of
-buckets or fewer. For example, asking for at most 20 buckets over a year results
-in monthly buckets:
+Description: Creates groups of values - buckets - out of a datetime or numeric input. The size of the buckets can either be provided directly, or chosen based on a recommended count and values range. Supported types
+Examples: BUCKET can work in two modes, one in which the size of the bucket is computed
+based on a buckets count recommendation (four parameters) and a range, and
+another in which the bucket size is provided directly (two parameters). Using a target number of buckets, a start of a range, and an end of a range,
+BUCKET picks an appropriate bucket size to generate the target number of buckets or fewer.
+For example, asking for at most 20 buckets over a year results in monthly buckets:
 ```esql
 FROM employees
 | WHERE hire_date >= "1985-01-01T00:00:00Z" AND hire_date < "1986-01-01T00:00:00Z"
-| EVAL month = BUCKET(hire_date, 20, "1985-01-01T00:00:00Z", "1986-01-01T00:00:00Z")
-| KEEP hire_date, month
+| STATS hire_date = MV_SORT(VALUES(hire_date)) BY month = BUCKET(hire_date, 20, "1985-01-01T00:00:00Z", "1986-01-01T00:00:00Z")
 | SORT hire_date
 ```
 
-The goal isn’t to provide exactly the target number of buckets, it’s to pick a
-range that people are comfortable with that provides at most the target number
-of buckets.Combine BUCKET with
-STATS ... BY to create a histogram:
+The goal isn’t to provide exactly the target number of buckets,
+it’s to pick a range that people are comfortable with that provides at most the target number of buckets. Combine BUCKET with an aggregation to create a histogram:
 ```esql
 FROM employees
 | WHERE hire_date >= "1985-01-01T00:00:00Z" AND hire_date < "1986-01-01T00:00:00Z"
-| EVAL month = BUCKET(hire_date, 20, "1985-01-01T00:00:00Z", "1986-01-01T00:00:00Z")
-| STATS hires_per_month = COUNT(*) BY month
+| STATS hires_per_month = COUNT(*) BY month = BUCKET(hire_date, 20, "1985-01-01T00:00:00Z", "1986-01-01T00:00:00Z")
 | SORT month
 ```
 
 BUCKET does not create buckets that don’t match any documents.
 That’s why this example is missing 1985-03-01 and other dates.
-Asking for more buckets can result in a smaller range. For example, asking for
-at most 100 buckets in a year results in weekly buckets:
+Asking for more buckets can result in a smaller range.
+For example, asking for at most 100 buckets in a year results in weekly buckets:
 ```esql
 FROM employees
 | WHERE hire_date >= "1985-01-01T00:00:00Z" AND hire_date < "1986-01-01T00:00:00Z"
-| EVAL week = BUCKET(hire_date, 100, "1985-01-01T00:00:00Z", "1986-01-01T00:00:00Z")
-| STATS hires_per_week = COUNT(*) BY week
+| STATS hires_per_week = COUNT(*) BY week = BUCKET(hire_date, 100, "1985-01-01T00:00:00Z", "1986-01-01T00:00:00Z")
 | SORT week
 ```
 
-BUCKET does not filter any rows. It only uses the provided range to
-pick a good bucket size. For rows with a value outside of the range, it returns
-a bucket value that corresponds to a bucket outside the range. Combine
-BUCKET with WHERE to filter rows.
-BUCKET can also operate on numeric fields. For example, to create a
-salary histogram:
+BUCKET does not filter any rows. It only uses the provided range to pick a good bucket size.
+For rows with a value outside of the range, it returns a bucket value that corresponds to a bucket outside the range.
+Combine `BUCKET` with WHERE to filter rows.
+If the desired bucket size is known in advance, simply provide it as the second
+argument, leaving the range out:
 ```esql
 FROM employees
-| EVAL bs = BUCKET(salary, 20, 25324, 74999)
-| STATS COUNT(*) by bs
+| WHERE hire_date >= "1985-01-01T00:00:00Z" AND hire_date < "1986-01-01T00:00:00Z"
+| STATS hires_per_week = COUNT(*) BY week = BUCKET(hire_date, 1 week)
+| SORT week
+```
+
+When providing the bucket size as the second parameter for a date field, it must
+be a time duration or date period.
+BUCKET can also operate on numeric fields. For example, to create a salary histogram:
+```esql
+FROM employees
+| STATS COUNT(*) by bs = BUCKET(salary, 20, 25324, 74999)
 | SORT bs
 ```
 
-Unlike the earlier example that intentionally filters on a date range, you
-rarely want to filter on a numeric range. You have to find the min and max
-separately. ES|QL doesn’t yet have an easy way to do that automatically.ExamplesCreate hourly buckets for the last 24 hours, and calculate the number of events
-per hour:
+Unlike the earlier example that intentionally filters on a date range, you rarely want to filter on a numeric range.
+You have to find the min and max separately. ES|QL doesn’t yet have an easy way to do that automatically. The range can be omitted if the desired bucket size is known in advance. Simply
+provide it as the second argument:
+```esql
+FROM employees
+| WHERE hire_date >= "1985-01-01T00:00:00Z" AND hire_date < "1986-01-01T00:00:00Z"
+| STATS c = COUNT(1) BY b = BUCKET(salary, 5000.)
+| SORT b
+```
+
+When providing the bucket size as the second parameter for a numeric field, it must
+be of a floating point type (for example `5000.`).
+Create hourly buckets for the last 24 hours, and calculate the number of events per hour:
 ```esql
 FROM sample_data
 | WHERE @timestamp >= NOW() - 1 day and @timestamp < NOW()
-| EVAL bucket = BUCKET(@timestamp, 25, NOW() - 1 day, NOW())
-| STATS COUNT(*) BY bucket
+| STATS COUNT(*) BY bucket = BUCKET(@timestamp, 25, NOW() - 1 day, NOW())
 ```
 
-Create monthly buckets for the year 1985, and calculate the average salary by
-hiring month:
+Create monthly buckets for the year 1985, and calculate the average salary by hiring month:
 ```esql
 FROM employees
 | WHERE hire_date >= "1985-01-01T00:00:00Z" AND hire_date < "1986-01-01T00:00:00Z"
-| EVAL bucket = BUCKET(hire_date, 20, "1985-01-01T00:00:00Z", "1986-01-01T00:00:00Z")
-| STATS AVG(salary) BY bucket
+| STATS AVG(salary) BY bucket = BUCKET(hire_date, 20, "1985-01-01T00:00:00Z", "1986-01-01T00:00:00Z")
 | SORT bucket
 ```
+
+BUCKET may be used in both the aggregating and grouping part of the
+STATS … BY … command provided that in the aggregating
+part the function is referenced by an alias defined in the
+grouping part, or that it is invoked with the exact same expression:
+```esql
+FROM employees
+| STATS s1 = b1 + 1, s2 = BUCKET(salary / 1000 + 999, 50.) + 2 BY b1 = BUCKET(salary / 100 + 99, 50.), b2 = BUCKET(salary / 1000 + 999, 50.)
+| SORT b1, b2
+| KEEP s1, b1, s2, b2
+```
diff --git a/...lution/observability_ai_assistant_app/server/functions/query/esql_docs/esql-cast-(::).txt b/...lution/observability_ai_assistant_app/server/functions/query/esql_docs/esql-cast-(::).txt
@@ -0,0 +1,7 @@
+Cast (::)
+
+The :: operator provides a convenient alternative syntax to the TO_<type>
+conversion functions.
+```esql
+ROW ver = CONCAT(("0"::INT + 1)::STRING, ".2.3")::VERSION
+```
diff --git a/..._solution/observability_ai_assistant_app/server/functions/query/esql_docs/esql-concat.txt b/..._solution/observability_ai_assistant_app/server/functions/query/esql_docs/esql-concat.txt
@@ -1,11 +1,13 @@
 CONCAT
 
 Syntax
-CONCAT(string1, string2[, ..., stringN])
 Parameters
-stringX
+string1
 Strings to concatenate.
-DescriptionConcatenates two or more strings.Example
+string2
+Strings to concatenate.
+DescriptionConcatenates two or more strings.Supported types
+Example
 ```esql
 FROM employees
 | KEEP first_name, last_name

diff --git a/...lution/observability_ai_assistant_app/server/functions/query/esql_docs/esql-date_diff.txt b/...lution/observability_ai_assistant_app/server/functions/query/esql_docs/esql-date_diff.txt
@@ -3,15 +3,17 @@ DATE_DIFF
 Syntax
 Parameters
 unit
-Time difference unit.
+Time difference unit
 startTimestamp
-Start timestamp.
+A string representing a start timestamp
 endTimestamp
-End timestamp.
-DescriptionSubtracts the startTimestamp from the endTimestamp and returns the
-difference in multiples of unit. If startTimestamp is later than the
-endTimestamp, negative values are returned.
-Supported types
+A string representing an end timestamp
+DescriptionSubtracts the startTimestamp from the endTimestamp and returns the difference in multiples of unit. If startTimestamp is later than the endTimestamp, negative values are returned.
+Note that while there is an overlap between the function’s supported units and
+ES|QL’s supported time span literals, these sets are distinct and not
+interchangeable. Similarly, the supported abbreviations are conveniently shared
+with implementations of this function in other established products and not
+necessarily common with the date-time nomenclature used by Elasticsearch.Supported types
 Example
 ```esql
 ROW date1 = TO_DATETIME("2023-12-02T11:00:00.000Z"), date2 = TO_DATETIME("2023-12-02T11:00:00.001Z")

diff --git a/...ion/observability_ai_assistant_app/server/functions/query/esql_docs/esql-date_extract.txt b/...ion/observability_ai_assistant_app/server/functions/query/esql_docs/esql-date_extract.txt
@@ -1,30 +1,19 @@
 DATE_EXTRACT
 
 Syntax
-DATE_EXTRACT(date_part, date)
 Parameters
-date_part
-Part of the date to extract. Can be: aligned_day_of_week_in_month,
-aligned_day_of_week_in_year, aligned_week_of_month, aligned_week_of_year,
-ampm_of_day, clock_hour_of_ampm, clock_hour_of_day, day_of_month,
-day_of_week, day_of_year, epoch_day, era, hour_of_ampm, hour_of_day,
-instant_seconds, micro_of_day, micro_of_second, milli_of_day,
-milli_of_second, minute_of_day, minute_of_hour, month_of_year,
-nano_of_day, nano_of_second, offset_seconds, proleptic_month,
-second_of_day, second_of_minute, year, or year_of_era. Refer to
-java.time.temporal.ChronoField
-for a description of these values.
-If null, the function returns null.
+datePart
+Part of the date to extract.  Can be: aligned_day_of_week_in_month, aligned_day_of_week_in_year, aligned_week_of_month, aligned_week_of_year, ampm_of_day, clock_hour_of_ampm, clock_hour_of_day, day_of_month, day_of_week, day_of_year, epoch_day, era, hour_of_ampm, hour_of_day, instant_seconds, micro_of_day, micro_of_second, milli_of_day, milli_of_second, minute_of_day, minute_of_hour, month_of_year, nano_of_day, nano_of_second, offset_seconds, proleptic_month, second_of_day, second_of_minute, year, or year_of_era. Refer to java.time.temporal.ChronoField for a description of these values.  If null, the function returns null.
 date
 Date expression. If null, the function returns null.
-DescriptionExtracts parts of a date, like year, month, day, hour.Examples
+DescriptionExtracts parts of a date, like year, month, day, hour.Supported types
+Examples
 ```esql
 ROW date = DATE_PARSE("yyyy-MM-dd", "2022-05-06")
 | EVAL year = DATE_EXTRACT("year", date)
 ```
 
-Find all events that occurred outside of business hours (before 9 AM or after 5
-PM), on any given date:
+Find all events that occurred outside of business hours (before 9 AM or after 5 PM), on any given date:
 ```esql
 FROM sample_data
 | WHERE DATE_EXTRACT("hour_of_day", @timestamp) < 9 AND DATE_EXTRACT("hour_of_day", @timestamp) >= 17

diff --git a/...tion/observability_ai_assistant_app/server/functions/query/esql_docs/esql-date_format.txt b/...tion/observability_ai_assistant_app/server/functions/query/esql_docs/esql-date_format.txt
@@ -1,15 +1,13 @@
 DATE_FORMAT
 
 Syntax
-DATE_FORMAT([format,] date)
 Parameters
-format
-Date format (optional).  If no format is specified, the
-yyyy-MM-dd'T'HH:mm:ss.SSSZ format is used.  If null, the function returns
-null.
+dateFormat
+Date format (optional).  If no format is specified, the yyyy-MM-dd'T'HH:mm:ss.SSSZ format is used. If null, the function returns null.
 date
 Date expression. If null, the function returns null.
-DescriptionReturns a string representation of a date, in the provided format.Example
+DescriptionReturns a string representation of a date, in the provided format.Supported types
+Example
 ```esql
 FROM employees
 | KEEP first_name, last_name, hire_date

diff --git a/...ution/observability_ai_assistant_app/server/functions/query/esql_docs/esql-date_parse.txt b/...ution/observability_ai_assistant_app/server/functions/query/esql_docs/esql-date_parse.txt
@@ -1,17 +1,13 @@
 DATE_PARSE
 
 Syntax
-DATE_PARSE([format,] date_string)
 Parameters
-format
-The date format. Refer to the
-DateTimeFormatter
-documentation for the syntax. If null, the function returns null.
-date_string
-Date expression as a string. If null or an empty string, the function returns
-null.
-DescriptionReturns a date by parsing the second argument using the format specified in the
-first argument.Example
+datePattern
+The date format. Refer to the DateTimeFormatter documentation for the syntax. If null, the function returns null.
+dateString
+Date expression as a string. If null or an empty string, the function returns null.
+DescriptionReturns a date by parsing the second argument using the format specified in the first argument.Supported types
+Example
 ```esql
 ROW date_string = "2022-05-06"
 | EVAL date = DATE_PARSE("yyyy-MM-dd", date_string)