diff --git a/.chloggen/add_multiline_pattern_omit_config.yaml b/.chloggen/add_multiline_pattern_omit_config.yaml new file mode 100644 index 000000000000..6ff655041a20 --- /dev/null +++ b/.chloggen/add_multiline_pattern_omit_config.yaml @@ -0,0 +1,17 @@ +# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix' +change_type: enhancement + +# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver) +component: pkg/stanza + +# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`). +note: Add 'omit_pattern' setting to `split.Config`. + +# One or more tracking issues related to the change +issues: [26381] + +# (Optional) One or more lines of additional information to render under the primary note. +# These lines will be padded with 2 spaces and then inserted directly into the document. +# Use pipe (|) for multiline entries. +subtext: | + This can be used to omit the start or end pattern from a log entry. \ No newline at end of file diff --git a/pkg/stanza/docs/operators/file_input.md b/pkg/stanza/docs/operators/file_input.md index d039c88baf14..a2b93f56db2f 100644 --- a/pkg/stanza/docs/operators/file_input.md +++ b/pkg/stanza/docs/operators/file_input.md @@ -44,6 +44,8 @@ If set, the `multiline` configuration block instructs the `file_input` operator The `multiline` configuration block must contain exactly one of `line_start_pattern` or `line_end_pattern`. These are regex patterns that match either the beginning of a new log entry, or the end of a log entry. +The `omit_pattern` setting can be used to omit the start/end pattern from each entry. + If using multiline, last log can sometimes be not flushed due to waiting for more content. In order to forcefully flush last buffered log after certain period of time, use `force_flush_period` option. 
diff --git a/pkg/stanza/docs/operators/tcp_input.md b/pkg/stanza/docs/operators/tcp_input.md index ed690a432bcc..37197f49fcb1 100644 --- a/pkg/stanza/docs/operators/tcp_input.md +++ b/pkg/stanza/docs/operators/tcp_input.md @@ -39,6 +39,8 @@ If set, the `multiline` configuration block instructs the `tcp_input` operator t The `multiline` configuration block must contain exactly one of `line_start_pattern` or `line_end_pattern`. These are regex patterns that match either the beginning of a new log entry, or the end of a log entry. +The `omit_pattern` setting can be used to omit the start/end pattern from each entry. + #### Supported encodings | Key | Description diff --git a/pkg/stanza/docs/operators/udp_input.md b/pkg/stanza/docs/operators/udp_input.md index 8697b3f49427..4a31a02718d9 100644 --- a/pkg/stanza/docs/operators/udp_input.md +++ b/pkg/stanza/docs/operators/udp_input.md @@ -28,6 +28,8 @@ If set, the `multiline` configuration block instructs the `udp_input` operator t The `multiline` configuration block must contain exactly one of `line_start_pattern` or `line_end_pattern`. These are regex patterns that match either the beginning of a new log entry, or the end of a log entry. +The `omit_pattern` setting can be used to omit the start/end pattern from each entry. 
+ #### Supported encodings | Key | Description diff --git a/pkg/stanza/split/split.go b/pkg/stanza/split/split.go index 976a70bf90f7..b5883c14841f 100644 --- a/pkg/stanza/split/split.go +++ b/pkg/stanza/split/split.go @@ -16,6 +16,7 @@ import ( type Config struct { LineStartPattern string `mapstructure:"line_start_pattern"` LineEndPattern string `mapstructure:"line_end_pattern"` + OmitPattern bool `mapstructure:"omit_pattern"` } // Func will return a bufio.SplitFunc based on the config @@ -37,20 +38,20 @@ func (c Config) Func(enc encoding.Encoding, flushAtEOF bool, maxLogSize int) (sp if err != nil { return nil, fmt.Errorf("compile line end regex: %w", err) } - splitFunc = LineEndSplitFunc(re, flushAtEOF) + splitFunc = LineEndSplitFunc(re, c.OmitPattern, flushAtEOF) case c.LineStartPattern != "": re, err := regexp.Compile("(?m)" + c.LineStartPattern) if err != nil { return nil, fmt.Errorf("compile line start regex: %w", err) } - splitFunc = LineStartSplitFunc(re, flushAtEOF) + splitFunc = LineStartSplitFunc(re, c.OmitPattern, flushAtEOF) } return splitFunc, nil } // LineStartSplitFunc creates a bufio.SplitFunc that splits an incoming stream into // tokens that start with a match to the regex pattern provided -func LineStartSplitFunc(re *regexp.Regexp, flushAtEOF bool) bufio.SplitFunc { +func LineStartSplitFunc(re *regexp.Regexp, omitPattern bool, flushAtEOF bool) bufio.SplitFunc { return func(data []byte, atEOF bool) (advance int, token []byte, err error) { firstLoc := re.FindIndex(data) if firstLoc == nil { @@ -81,6 +82,10 @@ func LineStartSplitFunc(re *regexp.Regexp, flushAtEOF bool) bufio.SplitFunc { // Flush if no more data is expected if atEOF && flushAtEOF { + if omitPattern { + return len(data), data[firstMatchEnd:], nil + } + return len(data), data, nil } @@ -90,6 +95,9 @@ func LineStartSplitFunc(re *regexp.Regexp, flushAtEOF bool) bufio.SplitFunc { return 0, nil, nil // read more data and try again } secondMatchStart := secondLoc[0] + secondLocOfset + if 
omitPattern { + return secondMatchStart, data[firstMatchEnd:secondMatchStart], nil + } // start scanning at the beginning of the second match // the token begins at the first match, and ends at the beginning of the second match @@ -99,7 +107,7 @@ func LineStartSplitFunc(re *regexp.Regexp, flushAtEOF bool) bufio.SplitFunc { // LineEndSplitFunc creates a bufio.SplitFunc that splits an incoming stream into // tokens that end with a match to the regex pattern provided -func LineEndSplitFunc(re *regexp.Regexp, flushAtEOF bool) bufio.SplitFunc { +func LineEndSplitFunc(re *regexp.Regexp, omitPattern bool, flushAtEOF bool) bufio.SplitFunc { return func(data []byte, atEOF bool) (advance int, token []byte, err error) { loc := re.FindIndex(data) if loc == nil { @@ -115,6 +123,11 @@ func LineEndSplitFunc(re *regexp.Regexp, flushAtEOF bool) bufio.SplitFunc { if loc[1] == len(data)-1 && !atEOF { return 0, nil, nil } + + if omitPattern { + return loc[1], data[:loc[0]], nil + } + return loc[1], data[:loc[1]], nil } } diff --git a/pkg/stanza/split/split_test.go b/pkg/stanza/split/split_test.go index 9af21034b1ca..9db4b7dbb39e 100644 --- a/pkg/stanza/split/split_test.go +++ b/pkg/stanza/split/split_test.go @@ -75,6 +75,15 @@ func TestLineStartSplitFunc(t *testing.T) { `LOGSTART 123 log1`, }, }, + { + Name: "OneLogSimpleOmitPattern", + Pattern: `LOGSTART \d+ `, + OmitPattern: true, + Input: []byte("LOGSTART 123 log1LOGSTART 123 a"), + ExpectedTokens: []string{ + `log1`, + }, + }, { Name: "TwoLogsSimple", Pattern: `LOGSTART \d+ `, @@ -84,6 +93,17 @@ func TestLineStartSplitFunc(t *testing.T) { `LOGSTART 234 log2 `, }, }, + { + + Name: "TwoLogsSimpleOmitPattern", + Pattern: `LOGSTART \d+ `, + OmitPattern: true, + Input: []byte(`LOGSTART 123 log1 LOGSTART 234 log2 LOGSTART 345 foo`), + ExpectedTokens: []string{ + `log1 `, + `log2 `, + }, + }, { Name: "TwoLogsLineStart", Pattern: `^LOGSTART \d+ `, @@ -93,11 +113,38 @@ func TestLineStartSplitFunc(t *testing.T) { "LOGSTART 234 log2\n", }, 
}, + { + Name: "TwoLogsLineStartOmitPattern", + Pattern: `^LOGSTART \d+ `, + OmitPattern: true, + Input: []byte("LOGSTART 123 LOGSTART 345 log1\nLOGSTART 234 log2\nLOGSTART 345 foo"), + ExpectedTokens: []string{ + "LOGSTART 345 log1\n", + "log2\n", + }, + }, + { + Name: "TwoLogsLineStartOmitPatternNoStringBeginningToken", + Pattern: `LOGSTART \d+ `, + OmitPattern: true, + Input: []byte("LOGSTART 123 LOGSTART 345 log1\nLOGSTART 234 log2\nLOGSTART 345 foo"), + ExpectedTokens: []string{ + "LOGSTART 345 log1\n", + "log2\n", + }, + }, { Name: "NoMatches", Pattern: `LOGSTART \d+ `, Input: []byte(`file that has no matches in it`), }, + { + + Name: "NoMatchesOmitPattern", + Pattern: `LOGSTART \d+ `, + OmitPattern: true, + Input: []byte(`file that has no matches in it`), + }, { Name: "PrecedingNonMatches", Pattern: `LOGSTART \d+ `, @@ -107,6 +154,16 @@ func TestLineStartSplitFunc(t *testing.T) { `LOGSTART 123 part that matches`, }, }, + { + Name: "PrecedingNonMatchesOmitPattern", + Pattern: `LOGSTART \d+ `, + OmitPattern: true, + Input: []byte(`part that doesn't match LOGSTART 123 part that matchesLOGSTART 123 foo`), + ExpectedTokens: []string{ + `part that doesn't match `, + `part that matches`, + }, + }, { Name: "HugeLog100", Pattern: `LOGSTART \d+ `, @@ -120,6 +177,21 @@ func TestLineStartSplitFunc(t *testing.T) { `LOGSTART 123 ` + string(splittest.GenerateBytes(100)), }, }, + { + Name: "HugeLog100OmitPattern", + Pattern: `LOGSTART \d+ `, + OmitPattern: true, + + Input: func() []byte { + newInput := []byte(`LOGSTART 123 `) + newInput = append(newInput, splittest.GenerateBytes(100)...) + newInput = append(newInput, []byte(`LOGSTART 234 endlog`)...) 
+ return newInput + }(), + ExpectedTokens: []string{ + string(splittest.GenerateBytes(100)), + }, + }, { Name: "HugeLog10000", Pattern: `LOGSTART \d+ `, @@ -144,6 +216,18 @@ func TestLineStartSplitFunc(t *testing.T) { }(), ExpectedError: errors.New("bufio.Scanner: token too long"), }, + { + Name: "ErrTooLongOmitPattern", + Pattern: `LOGSTART \d+ `, + OmitPattern: true, + Input: func() []byte { + newInput := []byte(`LOGSTART 123 `) + newInput = append(newInput, splittest.GenerateBytes(1000000)...) + newInput = append(newInput, []byte(`LOGSTART 234 endlog`)...) + return newInput + }(), + ExpectedError: errors.New("bufio.Scanner: token too long"), + }, { Name: "MultipleMultilineLogs", Pattern: `^LOGSTART \d+`, @@ -153,6 +237,23 @@ func TestLineStartSplitFunc(t *testing.T) { "LOGSTART 17 log2\nLOGPART log2\nanother line\n", }, }, + { + Name: "MultipleMultilineLogsOmitPattern", + Pattern: `^LOGSTART \d+`, + OmitPattern: true, + Input: []byte("LOGSTART 12 log1\t \nLOGPART log1\nLOGPART log1\t \nLOGSTART 17 log2\nLOGPART log2\nanother line\nLOGSTART 43 log5"), + ExpectedTokens: []string{ + " log1\t \nLOGPART log1\nLOGPART log1\t \n", + " log2\nLOGPART log2\nanother line\n", + }, + }, + { + + Name: "LogsWithoutFlusherOmitPattern", + Pattern: `^LOGSTART \d+`, + OmitPattern: true, + Input: []byte("LOGPART log1\nLOGPART log1\t \n"), + }, { Name: "NoMatch", Pattern: `^LOGSTART \d+`, @@ -161,14 +262,14 @@ func TestLineStartSplitFunc(t *testing.T) { } for _, tc := range testCases { - cfg := Config{LineStartPattern: tc.Pattern} + cfg := Config{LineStartPattern: tc.Pattern, OmitPattern: tc.OmitPattern} splitFunc, err := cfg.Func(unicode.UTF8, false, 0) require.NoError(t, err) t.Run(tc.Name, tc.Run(splitFunc)) } t.Run("FirstMatchHitsEndOfBuffer", func(t *testing.T) { - splitFunc := LineStartSplitFunc(regexp.MustCompile("LOGSTART"), false) + splitFunc := LineStartSplitFunc(regexp.MustCompile("LOGSTART"), false, false) data := []byte(`LOGSTART`) t.Run("NotAtEOF", func(t *testing.T) { 
@@ -255,6 +356,15 @@ func TestLineEndSplitFunc(t *testing.T) { `my log LOGEND 123`, }, }, + { + Name: "OneLogSimpleOmitPattern", + Pattern: `LOGEND \d+`, + OmitPattern: true, + Input: []byte(`my log LOGEND 123`), + ExpectedTokens: []string{ + `my log `, + }, + }, { Name: "TwoLogsSimple", Pattern: `LOGEND \d+`, @@ -264,6 +374,16 @@ func TestLineEndSplitFunc(t *testing.T) { `log2 LOGEND 234`, }, }, + { + Name: "TwoLogsSimpleOmitPattern", + Pattern: `LOGEND \d+`, + OmitPattern: true, + Input: []byte(`log1 LOGEND 123log2 LOGEND 234`), + ExpectedTokens: []string{ + `log1 `, + `log2 `, + }, + }, { Name: "TwoLogsLineEndSimple", Pattern: `LOGEND$`, @@ -273,11 +393,38 @@ func TestLineEndSplitFunc(t *testing.T) { "\nlog2 LOGEND", }, }, + { + Name: "TwoLogsLineEndSimpleOmitPattern", + Pattern: `LOGEND$`, + OmitPattern: true, + Input: []byte("log1 LOGEND LOGEND\nlog2 LOGEND\n"), + ExpectedTokens: []string{ + "log1 LOGEND ", + "\nlog2 ", + }, + }, + { + Name: "TwoLogsLineEndSimpleOmitPatternNoStringEndingToken", + Pattern: `LOGEND`, + OmitPattern: true, + Input: []byte("log1 LOGEND LOGEND\nlog2 LOGEND\n"), + ExpectedTokens: []string{ + "log1 ", + " ", + "\nlog2 ", + }, + }, { Name: "NoMatches", Pattern: `LOGEND \d+`, Input: []byte(`file that has no matches in it`), }, + { + Name: "NoMatchesOmitPattern", + OmitPattern: true, + Pattern: `LOGEND \d+`, + Input: []byte(`file that has no matches in it`), + }, { Name: "NonMatchesAfter", Pattern: `LOGEND \d+`, @@ -286,6 +433,15 @@ func TestLineEndSplitFunc(t *testing.T) { `part that matches LOGEND 123`, }, }, + { + Name: "NonMatchesAfterOmitPattern", + Pattern: `LOGEND \d+`, + OmitPattern: true, + Input: []byte(`part that matches LOGEND 123 part that doesn't match`), + ExpectedTokens: []string{ + `part that matches `, + }, + }, { Name: "HugeLog100", Pattern: `LOGEND \d`, @@ -298,6 +454,19 @@ func TestLineEndSplitFunc(t *testing.T) { string(splittest.GenerateBytes(100)) + `LOGEND 1`, }, }, + { + Name: "HugeLog100OmitPattern", + Pattern: 
`LOGEND \d`, + OmitPattern: true, + Input: func() []byte { + newInput := splittest.GenerateBytes(100) + newInput = append(newInput, []byte(`LOGEND 1 `)...) + return newInput + }(), + ExpectedTokens: []string{ + string(splittest.GenerateBytes(100)), + }, + }, { Name: "HugeLog10000", Pattern: `LOGEND \d`, @@ -320,6 +489,17 @@ func TestLineEndSplitFunc(t *testing.T) { }(), ExpectedError: errors.New("bufio.Scanner: token too long"), }, + { + Name: "HugeLog1000000OmitPattern", + Pattern: `LOGEND \d`, + OmitPattern: true, + Input: func() []byte { + newInput := splittest.GenerateBytes(1000000) + newInput = append(newInput, []byte(`LOGEND 1 `)...) + return newInput + }(), + ExpectedError: errors.New("bufio.Scanner: token too long"), + }, { Name: "MultiplesplitLogs", Pattern: `^LOGEND.*$`, @@ -329,6 +509,16 @@ func TestLineEndSplitFunc(t *testing.T) { "\nLOGSTART 17 log2\nLOGPART log2\nLOGEND log2", }, }, + { + Name: "MultipleMultilineLogsOmitPattern", + Pattern: `^LOGEND.*$`, + OmitPattern: true, + Input: []byte("LOGSTART 12 log1\t \nLOGPART log1\nLOGEND log1\t \nLOGSTART 17 log2\nLOGPART log2\nLOGEND log2\nLOGSTART 43 log5"), + ExpectedTokens: []string{ + "LOGSTART 12 log1\t \nLOGPART log1\n", + "\nLOGSTART 17 log2\nLOGPART log2\n", + }, + }, { Name: "NoMatch", Pattern: `^LOGEND.*$`, @@ -337,7 +527,7 @@ func TestLineEndSplitFunc(t *testing.T) { } for _, tc := range testCases { - cfg := Config{LineEndPattern: tc.Pattern} + cfg := Config{LineEndPattern: tc.Pattern, OmitPattern: tc.OmitPattern} splitFunc, err := cfg.Func(unicode.UTF8, false, 0) require.NoError(t, err) t.Run(tc.Name, tc.Run(splitFunc)) diff --git a/pkg/stanza/split/splittest/splittest.go b/pkg/stanza/split/splittest/splittest.go index b784b1b68dec..643d6db0a59c 100644 --- a/pkg/stanza/split/splittest/splittest.go +++ b/pkg/stanza/split/splittest/splittest.go @@ -73,6 +73,7 @@ func (r *testReader) splitFunc(split bufio.SplitFunc) bufio.SplitFunc { type TestCase struct { Name string Pattern string + OmitPattern 
bool Input []byte ExpectedTokens []string ExpectedError error diff --git a/receiver/filelogreceiver/README.md b/receiver/filelogreceiver/README.md index 70ccdc1e6a17..f4f730876ef2 100644 --- a/receiver/filelogreceiver/README.md +++ b/receiver/filelogreceiver/README.md @@ -74,6 +74,8 @@ If set, the `multiline` configuration block instructs the `file_input` operator The `multiline` configuration block must contain exactly one of `line_start_pattern` or `line_end_pattern`. These are regex patterns that match either the beginning of a new log entry, or the end of a log entry. +The `omit_pattern` setting can be used to omit the start/end pattern from each entry. + ### Supported encodings | Key | Description diff --git a/receiver/tcplogreceiver/README.md b/receiver/tcplogreceiver/README.md index d1e7a70b6d46..359a32764691 100644 --- a/receiver/tcplogreceiver/README.md +++ b/receiver/tcplogreceiver/README.md @@ -64,6 +64,8 @@ If set, the `multiline` configuration block instructs the `tcplog` receiver to s The `multiline` configuration block must contain exactly one of `line_start_pattern` or `line_end_pattern`. These are regex patterns that match either the beginning of a new log entry, or the end of a log entry. +The `omit_pattern` setting can be used to omit the start/end pattern from each entry. + #### Supported encodings | Key | Description diff --git a/receiver/udplogreceiver/README.md b/receiver/udplogreceiver/README.md index e5b0041d956d..cfe8d7ee2d00 100644 --- a/receiver/udplogreceiver/README.md +++ b/receiver/udplogreceiver/README.md @@ -52,6 +52,8 @@ If set, the `multiline` configuration block instructs the `udplog` receiver to s The `multiline` configuration block must contain exactly one of `line_start_pattern` or `line_end_pattern`. These are regex patterns that match either the beginning of a new log entry, or the end of a log entry. +The `omit_pattern` setting can be used to omit the start/end pattern from each entry. 
+ ### Supported encodings | Key | Description