From af149f80b477603bb806eaa723e4155270e1b272 Mon Sep 17 00:00:00 2001
From: Andrea Spacca
Date: Thu, 12 Aug 2021 10:31:30 +0200
Subject: [PATCH 01/20] [Filebeat] Add option for S3 input to work without SQS notification

---
 CHANGELOG.next.asciidoc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.next.asciidoc b/CHANGELOG.next.asciidoc
index 0c5be3c6ba3c..e3598a169c54 100644
--- a/CHANGELOG.next.asciidoc
+++ b/CHANGELOG.next.asciidoc
@@ -80,6 +80,7 @@ https://github.com/elastic/beats/compare/v7.0.0-alpha2...master[Check the HEAD d
 - Fix parsing of Elasticsearch node name by `elasticsearch/slowlog` fileset. {pull}14547[14547]
 - threatintel module: Changed the type of `threatintel.indicator.first_seen` from `keyword` to `date`. {pull}26765[26765]
 - Remove all alias fields pointing to ECS fields from modules. This affects the Suricata and Traefik modules. {issue}10535[10535] {pull}26627[26627]
+- Add option for S3 input to work without SQS notification {issue}18205[18205] {pull}NNNN[NNNN]
 
 *Heartbeat*
 - Remove long deprecated `watch_poll` functionality. {pull}27166[27166]

From 4c43eabaccb8024b9bd6f19e8bbe678157946834 Mon Sep 17 00:00:00 2001
From: Andrea Spacca
Date: Thu, 12 Aug 2021 10:35:24 +0200
Subject: [PATCH 02/20] changelog

---
 CHANGELOG.next.asciidoc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.next.asciidoc b/CHANGELOG.next.asciidoc
index e3598a169c54..0b30e121c4fe 100644
--- a/CHANGELOG.next.asciidoc
+++ b/CHANGELOG.next.asciidoc
@@ -80,7 +80,7 @@ https://github.com/elastic/beats/compare/v7.0.0-alpha2...master[Check the HEAD d
 - Fix parsing of Elasticsearch node name by `elasticsearch/slowlog` fileset. {pull}14547[14547]
 - threatintel module: Changed the type of `threatintel.indicator.first_seen` from `keyword` to `date`. {pull}26765[26765]
 - Remove all alias fields pointing to ECS fields from modules. This affects the Suricata and Traefik modules. {issue}10535[10535] {pull}26627[26627]
-- Add option for S3 input to work without SQS notification {issue}18205[18205] {pull}NNNN[NNNN]
+- Add option for S3 input to work without SQS notification {issue}18205[18205] {pull}27332[27332]
 
 *Heartbeat*
 - Remove long deprecated `watch_poll` functionality.
{pull}27166[27166] From 23d848828e7bb8086b162b84c00cdc0a1f1b1659 Mon Sep 17 00:00:00 2001 From: Andrea Spacca Date: Thu, 12 Aug 2021 11:18:48 +0200 Subject: [PATCH 03/20] docs and configs --- .../docs/inputs/input-aws-s3.asciidoc | 78 +++++++++++++++++-- x-pack/filebeat/filebeat.reference.yml | 25 ++++-- .../filebeat/module/aws/_meta/docs.asciidoc | 38 ++++++++- .../module/aws/cloudtrail/config/aws-s3.yml | 13 ++++ .../module/aws/cloudtrail/manifest.yml | 3 + .../module/aws/cloudwatch/config/aws-s3.yml | 13 ++++ .../module/aws/cloudwatch/manifest.yml | 3 + .../filebeat/module/aws/ec2/config/aws-s3.yml | 13 ++++ x-pack/filebeat/module/aws/ec2/manifest.yml | 3 + .../filebeat/module/aws/elb/config/aws-s3.yml | 13 ++++ x-pack/filebeat/module/aws/elb/manifest.yml | 3 + .../module/aws/s3access/config/aws-s3.yml | 13 ++++ .../filebeat/module/aws/s3access/manifest.yml | 3 + .../module/aws/vpcflow/config/input.yml | 13 ++++ .../filebeat/module/aws/vpcflow/manifest.yml | 3 + 15 files changed, 217 insertions(+), 20 deletions(-) diff --git a/x-pack/filebeat/docs/inputs/input-aws-s3.asciidoc b/x-pack/filebeat/docs/inputs/input-aws-s3.asciidoc index a302d0366b4a..1893f6335291 100644 --- a/x-pack/filebeat/docs/inputs/input-aws-s3.asciidoc +++ b/x-pack/filebeat/docs/inputs/input-aws-s3.asciidoc @@ -12,12 +12,19 @@ ++++ Use the `aws-s3` input to retrieve logs from S3 objects that are pointed to by -S3 notification events read from an SQS queue. This input can, for example, be +S3 notification events read from an SQS queue or directly polling list of S3 objects in an S3 bucket. +The use of SQS notification is preferred: polling list of S3 objects is expensive +in terms of performance and costs and should be preferably used only when no SQS +notification can be attached to the S3 buckets. This input can, for example, be used to receive S3 access logs to monitor detailed records for the requests that are made to a bucket. -This input depends on S3 notifications delivered to an SQS queue for -`s3:ObjectCreated:*` events. You must create an SQS queue and configure S3 +SQS notification method is enabled setting `queue_url` configuration value. +S3 bucket list polling method is enabled setting `s3_bucket` configuration value. +Both value cannot be set at the same time, at least one of the two value must be set. + +When using the SQS notification method this input depends on S3 notifications delivered +to an SQS queue for `s3:ObjectCreated:*` events. You must create an SQS queue and configure S3 to publish events to the queue. When processing a S3 object which pointed by a SQS message, if half of the set @@ -36,6 +43,24 @@ be stopped and the SQS message will be returned back to the queue. expand_event_list_from_field: Records ---- + +When using the direct polling list of S3 objects in an S3 buckets, +a number of workers that will process the S3 objects listed must be set +through the `s3_bucket_number_of_workers` config. +Listing of the S3 bucket will be polled according the time interval defined by +`s3_bucket_poll_interval` config. Default value is 120secs. + +["source","yaml",subs="attributes"] +---- +{beatname_lc}.inputs: +- type: aws-s3 + s3_bucket: arn:aws:s3:::test-s3-bucket + s3_bucket_number_of_workers: 5 + s3_bucket_poll_interval: 300s + credential_profile_name: elastic-beats + expand_event_list_from_field: Records +---- + The `aws-s3` input supports the following configuration options plus the <<{beatname_lc}-input-{type}-common-options>> described later. 
@@ -211,7 +236,7 @@ configuring multiline options. [float] ==== `queue_url` -URL of the AWS SQS queue that messages will be received from. Required. +URL of the AWS SQS queue that messages will be received from. (Required when `s3_bucket` is not set). [float] ==== `visibility_timeout` @@ -242,6 +267,24 @@ The maximum duration that an SQS `ReceiveMessage` call should wait for a message to arrive in the queue before returning. The default value is `20s`. The maximum value is `20s`. +[float] +==== `s3_bucket` + +ARN of the AWS S3 bucket that will be polled for list operation. (Required when `queue_url` is not set). + +[float] +==== `s3_bucket_poll_interval` + +Time interval for polling listing of the S3 bucket: default to `120s`. + + +[float] +==== `s3_bucket_number_of_workers` + +Number of workers that will process the S3 objects listed. (Required when `s3_bucket` is set). + + + [float] ==== `aws credentials` @@ -251,7 +294,8 @@ see <> for more details. [float] === AWS Permissions -Specific AWS permissions are required for IAM user to access SQS and S3: +Specific AWS permissions are required for IAM user to access SQS and S3 +when using the SQS notifications method: ---- s3:GetObject @@ -260,6 +304,14 @@ sqs:ChangeMessageVisibility sqs:DeleteMessage ---- +Reduced specific AWS permissions are required for IAM user to access S3 +when using the polling list of S3 bucket objects: + +---- +s3:GetObject +s3:ListBucket +---- + [float] === S3 and SQS setup @@ -271,7 +323,7 @@ for more details. [float] === Parallel Processing -Multiple Filebeat instances can read from the same SQS queues at the same time. +When using the SQS notifications method, multiple {beatname_uc} instances can read from the same SQS queues at the same time. To horizontally scale processing when there are large amounts of log data flowing into an S3 bucket, you can run multiple {beatname_uc} instances that read from the same SQS queues at the same time. No additional configuration is @@ -282,8 +334,8 @@ when multiple {beatname_uc} instances are running in parallel. To prevent {beatname_uc} from receiving and processing the message more than once, set the visibility timeout. -The visibility timeout begins when SQS returns a message to Filebeat. During -this time, Filebeat processes and deletes the message. However, if Filebeat +The visibility timeout begins when SQS returns a message to {beatname_uc}. During +this time, {beatname_uc} processes and deletes the message. However, if {beatname_uc} fails before deleting the message and your system doesn't call the DeleteMessage action for that message before the visibility timeout expires, the message becomes visible to other {beatname_uc} instances, and the message is received @@ -291,6 +343,16 @@ again. By default, the visibility timeout is set to 5 minutes for aws-s3 input in {beatname_uc}. 5 minutes is sufficient time for {beatname_uc} to read SQS messages and process related s3 log files. +When using the polling list of S3 bucket objects method be aware that if running multiple {beatname_uc} instances, +they can list the same S3 bucket at the same time. Since the state of the ingested S3 objects is persisted +(upon processing every page of the listing operation) in the `path.data` configuration +and multiple {beatname_uc} cannot share the same `path.data` this will produce repeated +ingestion of the S3 object. 
+Therefore, when using the polling list of S3 bucket objects method, scaling should be +vertical, with a single bigger {beatname_uc} instance and higher `s3_bucket_number_of_workers` +config value. + + [float] === Metrics diff --git a/x-pack/filebeat/filebeat.reference.yml b/x-pack/filebeat/filebeat.reference.yml index a0e990f74fe2..66a3bc1993a8 100644 --- a/x-pack/filebeat/filebeat.reference.yml +++ b/x-pack/filebeat/filebeat.reference.yml @@ -102,6 +102,15 @@ filebeat.modules: # AWS SQS queue url #var.queue_url: https://sqs.myregion.amazonaws.com/123456/myqueue + # AWS S3 bucket + #var.s3_bucket: 'arn:aws:s3:::mybucket + + # AWS S3 bucket poll interval + #var.s3_bucket_poll_interval: 300s + + # AWS S3 bucket number of workers + #var.s3_bucket_number_of_workers: 5 + # Process CloudTrail logs # default is true, set to false to skip Cloudtrail logs # var.process_cloudtrail_logs: false @@ -751,7 +760,7 @@ filebeat.modules: #------------------------------- Coredns Module ------------------------------- - module: coredns # Fileset for native deployment - log: + log: enabled: true # Set custom paths for the log files. If left empty, @@ -760,7 +769,7 @@ filebeat.modules: #----------------------------- Crowdstrike Module ----------------------------- - module: crowdstrike - + falcon: enabled: true @@ -875,7 +884,7 @@ filebeat.modules: #------------------------------ Envoyproxy Module ------------------------------ - module: envoyproxy # Fileset for native deployment - log: + log: enabled: true # Set custom paths for the log files. If left empty, @@ -1513,7 +1522,7 @@ filebeat.modules: # Oauth Token URL, should include the tenant ID #var.oauth2.token_url: "https://login.microsoftonline.com/TENANT-ID/oauth2/v2.0/token" - + # Related scopes, default should be included #var.oauth2.scopes: # - "https://api.security.microsoft.com/.default" @@ -2789,7 +2798,7 @@ filebeat.inputs: #rotation.external.strategy.copytruncate: # Regex that matches the rotated files. # suffix_regex: \.\d$ - # If the rotated filename suffix is a datetime, set it here. + # If the rotated filename suffix is a datetime, set it here. # dateformat: -20060102 ### State options @@ -3305,14 +3314,14 @@ filebeat.inputs: # Maximum duration after which events are available to the outputs, # if the number of events stored in the queue is < `flush.min_events`. #flush.timeout: 1s - + # The disk queue stores incoming events on disk until the output is # ready for them. This allows a higher event limit than the memory-only # queue and lets pending events persist through a restart. #disk: # The directory path to store the queue's data. #path: "${path.data}/diskqueue" - + # The maximum space the queue should occupy on disk. Depending on # input settings, events that exceed this limit are delayed or discarded. #max_size: 10GB @@ -4248,7 +4257,7 @@ output.elasticsearch: # Permissions to use for file creation. The default is 0600. #permissions: 0600 - + # Configure automatic file rotation on every startup. The default is true. #rotate_on_startup: true # ------------------------------- Console Output ------------------------------- diff --git a/x-pack/filebeat/module/aws/_meta/docs.asciidoc b/x-pack/filebeat/module/aws/_meta/docs.asciidoc index 4cd9482486a1..a3bb06ee12f0 100644 --- a/x-pack/filebeat/module/aws/_meta/docs.asciidoc +++ b/x-pack/filebeat/module/aws/_meta/docs.asciidoc @@ -10,9 +10,14 @@ beta[] This is a module for aws logs. It uses filebeat s3 input to get log files from -AWS S3 buckets with SQS notification. 
This module supports reading s3 server -access logs with `s3access` fileset, ELB access logs with `elb` fileset, VPC -flow logs with `vpcflow` fileset, and CloudTrail logs with `cloudtrail` fileset. +AWS S3 buckets with SQS notification or directly polling list of S3 objects in an S3 bucket. +The use of SQS notification is preferred: polling list of S3 objects is expensive +in terms of performance and costs, and cannot scale horizontally without ingestion duplication, +and should be preferably used only when no SQS notification can be attached to the S3 buckets. + +This module supports reading S3 server access logs with `s3access` fileset, +ELB access logs with `elb` fileset, VPC flow logs with `vpcflow` fileset, +and CloudTrail logs with `cloudtrail` fileset. Access logs contain detailed information about the requests made to these services. VPC flow logs captures information about the IP traffic going to and @@ -39,6 +44,8 @@ Example config: cloudtrail: enabled: false #var.queue_url: https://sqs.myregion.amazonaws.com/123456/myqueue + #var.s3_bucket: 'arn:aws:s3:::mybucket + #var.s3_bucket_poll_interval: 300s #var.shared_credential_file: /etc/filebeat/aws_credentials #var.credential_profile_name: fb-aws #var.access_key_id: access_key_id @@ -53,6 +60,8 @@ Example config: cloudwatch: enabled: false #var.queue_url: https://sqs.myregion.amazonaws.com/123456/myqueue + #var.s3_bucket: 'arn:aws:s3:::mybucket + #var.s3_bucket_poll_interval: 300s #var.shared_credential_file: /etc/filebeat/aws_credentials #var.credential_profile_name: fb-aws #var.access_key_id: access_key_id @@ -67,6 +76,8 @@ Example config: ec2: enabled: false #var.queue_url: https://sqs.myregion.amazonaws.com/123456/myqueue + #var.s3_bucket: 'arn:aws:s3:::mybucket + #var.s3_bucket_poll_interval: 300s #var.shared_credential_file: /etc/filebeat/aws_credentials #var.credential_profile_name: fb-aws #var.access_key_id: access_key_id @@ -81,6 +92,8 @@ Example config: elb: enabled: false #var.queue_url: https://sqs.myregion.amazonaws.com/123456/myqueue + #var.s3_bucket: 'arn:aws:s3:::mybucket + #var.s3_bucket_poll_interval: 300s #var.shared_credential_file: /etc/filebeat/aws_credentials #var.credential_profile_name: fb-aws #var.access_key_id: access_key_id @@ -95,6 +108,8 @@ Example config: s3access: enabled: false #var.queue_url: https://sqs.myregion.amazonaws.com/123456/myqueue + #var.s3_bucket: 'arn:aws:s3:::mybucket + #var.s3_bucket_poll_interval: 300s #var.shared_credential_file: /etc/filebeat/aws_credentials #var.credential_profile_name: fb-aws #var.access_key_id: access_key_id @@ -109,6 +124,8 @@ Example config: vpcflow: enabled: false #var.queue_url: https://sqs.myregion.amazonaws.com/123456/myqueue + #var.s3_bucket: 'arn:aws:s3:::mybucket + #var.s3_bucket_poll_interval: 300s #var.shared_credential_file: /etc/filebeat/aws_credentials #var.credential_profile_name: fb-aws #var.access_key_id: access_key_id @@ -123,7 +140,7 @@ Example config: *`var.queue_url`*:: -(Required) AWS SQS queue url. +AWS SQS queue url (Required when `var.s3_bucket` is not set). *`var.visibility_timeout`*:: @@ -134,6 +151,19 @@ Default to be 300 seconds. Maximum duration before AWS API request will be interrupted. Default to be 120 seconds. +*`var.s3_bucket`*:: + +AWS S3 bucket ARN (Required when `var.queue_url` is not set). + +*`var.s3_bucket_number_of_workers`*:: + +Number of workers that will process the S3 objects listed (Required when `var.s3_bucket` is set). +Use to vertically scale the input. 
+ +*`var.s3_bucket_poll_interval`*:: + +Interval between list requests to the S3 bucket. Default to be 120 seconds. + *`var.endpoint`*:: Custom endpoint used to access AWS APIs. diff --git a/x-pack/filebeat/module/aws/cloudtrail/config/aws-s3.yml b/x-pack/filebeat/module/aws/cloudtrail/config/aws-s3.yml index 4daf262994bf..17dd64a99fc5 100644 --- a/x-pack/filebeat/module/aws/cloudtrail/config/aws-s3.yml +++ b/x-pack/filebeat/module/aws/cloudtrail/config/aws-s3.yml @@ -1,5 +1,18 @@ type: aws-s3 +{{ if .queue_url }} queue_url: {{ .queue_url }} +{{ end }} +{{ if .s3_bucket }} +s3_bucket: {{ .s3_bucket }} +{{ end }} + +{{ if .s3_bucket_number_of_workers }} +s3_bucket_number_of_workers: {{ .s3_bucket_number_of_workers }} +{{ end }} + +{{ if .s3_bucket_poll_interval }} +s3_bucket_poll_interval: {{ .s3_bucket_poll_interval }} +{{ end }} file_selectors: {{ if .process_cloudtrail_logs }} - regex: '/CloudTrail/' diff --git a/x-pack/filebeat/module/aws/cloudtrail/manifest.yml b/x-pack/filebeat/module/aws/cloudtrail/manifest.yml index 1903ee34f251..d06d1b0b6774 100644 --- a/x-pack/filebeat/module/aws/cloudtrail/manifest.yml +++ b/x-pack/filebeat/module/aws/cloudtrail/manifest.yml @@ -4,6 +4,9 @@ var: - name: input default: aws-s3 - name: queue_url + - name: s3_bucket + - name: s3_bucket_number_of_workers + - name: s3_bucket_poll_interval - name: shared_credential_file - name: credential_profile_name - name: visibility_timeout diff --git a/x-pack/filebeat/module/aws/cloudwatch/config/aws-s3.yml b/x-pack/filebeat/module/aws/cloudwatch/config/aws-s3.yml index b0fb5feed0c5..2f7a694382aa 100644 --- a/x-pack/filebeat/module/aws/cloudwatch/config/aws-s3.yml +++ b/x-pack/filebeat/module/aws/cloudwatch/config/aws-s3.yml @@ -1,5 +1,18 @@ type: aws-s3 +{{ if .queue_url }} queue_url: {{ .queue_url }} +{{ end }} +{{ if .s3_bucket }} +s3_bucket: {{ .s3_bucket }} +{{ end }} + +{{ if .s3_bucket_number_of_workers }} +s3_bucket_number_of_workers: {{ .s3_bucket_number_of_workers }} +{{ end }} + +{{ if .s3_bucket_poll_interval }} +s3_bucket_poll_interval: {{ .s3_bucket_poll_interval }} +{{ end }} {{ if .credential_profile_name }} credential_profile_name: {{ .credential_profile_name }} diff --git a/x-pack/filebeat/module/aws/cloudwatch/manifest.yml b/x-pack/filebeat/module/aws/cloudwatch/manifest.yml index 84f672107c63..275cae461146 100644 --- a/x-pack/filebeat/module/aws/cloudwatch/manifest.yml +++ b/x-pack/filebeat/module/aws/cloudwatch/manifest.yml @@ -4,6 +4,9 @@ var: - name: input default: aws-s3 - name: queue_url + - name: s3_bucket + - name: s3_bucket_number_of_workers + - name: s3_bucket_poll_interval - name: shared_credential_file - name: credential_profile_name - name: visibility_timeout diff --git a/x-pack/filebeat/module/aws/ec2/config/aws-s3.yml b/x-pack/filebeat/module/aws/ec2/config/aws-s3.yml index b0fb5feed0c5..2f7a694382aa 100644 --- a/x-pack/filebeat/module/aws/ec2/config/aws-s3.yml +++ b/x-pack/filebeat/module/aws/ec2/config/aws-s3.yml @@ -1,5 +1,18 @@ type: aws-s3 +{{ if .queue_url }} queue_url: {{ .queue_url }} +{{ end }} +{{ if .s3_bucket }} +s3_bucket: {{ .s3_bucket }} +{{ end }} + +{{ if .s3_bucket_number_of_workers }} +s3_bucket_number_of_workers: {{ .s3_bucket_number_of_workers }} +{{ end }} + +{{ if .s3_bucket_poll_interval }} +s3_bucket_poll_interval: {{ .s3_bucket_poll_interval }} +{{ end }} {{ if .credential_profile_name }} credential_profile_name: {{ .credential_profile_name }} diff --git a/x-pack/filebeat/module/aws/ec2/manifest.yml b/x-pack/filebeat/module/aws/ec2/manifest.yml index 
84f672107c63..275cae461146 100644 --- a/x-pack/filebeat/module/aws/ec2/manifest.yml +++ b/x-pack/filebeat/module/aws/ec2/manifest.yml @@ -4,6 +4,9 @@ var: - name: input default: aws-s3 - name: queue_url + - name: s3_bucket + - name: s3_bucket_number_of_workers + - name: s3_bucket_poll_interval - name: shared_credential_file - name: credential_profile_name - name: visibility_timeout diff --git a/x-pack/filebeat/module/aws/elb/config/aws-s3.yml b/x-pack/filebeat/module/aws/elb/config/aws-s3.yml index b0fb5feed0c5..2f7a694382aa 100644 --- a/x-pack/filebeat/module/aws/elb/config/aws-s3.yml +++ b/x-pack/filebeat/module/aws/elb/config/aws-s3.yml @@ -1,5 +1,18 @@ type: aws-s3 +{{ if .queue_url }} queue_url: {{ .queue_url }} +{{ end }} +{{ if .s3_bucket }} +s3_bucket: {{ .s3_bucket }} +{{ end }} + +{{ if .s3_bucket_number_of_workers }} +s3_bucket_number_of_workers: {{ .s3_bucket_number_of_workers }} +{{ end }} + +{{ if .s3_bucket_poll_interval }} +s3_bucket_poll_interval: {{ .s3_bucket_poll_interval }} +{{ end }} {{ if .credential_profile_name }} credential_profile_name: {{ .credential_profile_name }} diff --git a/x-pack/filebeat/module/aws/elb/manifest.yml b/x-pack/filebeat/module/aws/elb/manifest.yml index 735591632349..2392b96ecf71 100644 --- a/x-pack/filebeat/module/aws/elb/manifest.yml +++ b/x-pack/filebeat/module/aws/elb/manifest.yml @@ -4,6 +4,9 @@ var: - name: input default: aws-s3 - name: queue_url + - name: s3_bucket + - name: s3_bucket_number_of_workers + - name: s3_bucket_poll_interval - name: shared_credential_file - name: credential_profile_name - name: visibility_timeout diff --git a/x-pack/filebeat/module/aws/s3access/config/aws-s3.yml b/x-pack/filebeat/module/aws/s3access/config/aws-s3.yml index b0fb5feed0c5..2f7a694382aa 100644 --- a/x-pack/filebeat/module/aws/s3access/config/aws-s3.yml +++ b/x-pack/filebeat/module/aws/s3access/config/aws-s3.yml @@ -1,5 +1,18 @@ type: aws-s3 +{{ if .queue_url }} queue_url: {{ .queue_url }} +{{ end }} +{{ if .s3_bucket }} +s3_bucket: {{ .s3_bucket }} +{{ end }} + +{{ if .s3_bucket_number_of_workers }} +s3_bucket_number_of_workers: {{ .s3_bucket_number_of_workers }} +{{ end }} + +{{ if .s3_bucket_poll_interval }} +s3_bucket_poll_interval: {{ .s3_bucket_poll_interval }} +{{ end }} {{ if .credential_profile_name }} credential_profile_name: {{ .credential_profile_name }} diff --git a/x-pack/filebeat/module/aws/s3access/manifest.yml b/x-pack/filebeat/module/aws/s3access/manifest.yml index 84f672107c63..275cae461146 100644 --- a/x-pack/filebeat/module/aws/s3access/manifest.yml +++ b/x-pack/filebeat/module/aws/s3access/manifest.yml @@ -4,6 +4,9 @@ var: - name: input default: aws-s3 - name: queue_url + - name: s3_bucket + - name: s3_bucket_number_of_workers + - name: s3_bucket_poll_interval - name: shared_credential_file - name: credential_profile_name - name: visibility_timeout diff --git a/x-pack/filebeat/module/aws/vpcflow/config/input.yml b/x-pack/filebeat/module/aws/vpcflow/config/input.yml index 8fb86aee8725..5a488669b9a8 100644 --- a/x-pack/filebeat/module/aws/vpcflow/config/input.yml +++ b/x-pack/filebeat/module/aws/vpcflow/config/input.yml @@ -1,7 +1,20 @@ {{ if eq .input "aws-s3" }} type: aws-s3 +{{ if .queue_url }} queue_url: {{ .queue_url }} +{{ end }} +{{ if .s3_bucket }} +s3_bucket: {{ .s3_bucket }} +{{ end }} + +{{ if .s3_bucket_number_of_workers }} +s3_bucket_number_of_workers: {{ .s3_bucket_number_of_workers }} +{{ end }} + +{{ if .s3_bucket_poll_interval }} +s3_bucket_poll_interval: {{ .s3_bucket_poll_interval }} +{{ end }} {{ if 
.credential_profile_name }} credential_profile_name: {{ .credential_profile_name }} diff --git a/x-pack/filebeat/module/aws/vpcflow/manifest.yml b/x-pack/filebeat/module/aws/vpcflow/manifest.yml index 0c2ec0f7e1b4..bcfeb3132858 100644 --- a/x-pack/filebeat/module/aws/vpcflow/manifest.yml +++ b/x-pack/filebeat/module/aws/vpcflow/manifest.yml @@ -4,6 +4,9 @@ var: - name: input default: aws-s3 - name: queue_url + - name: s3_bucket + - name: s3_bucket_number_of_workers + - name: s3_bucket_poll_interval - name: shared_credential_file - name: credential_profile_name - name: visibility_timeout From 4b91953aff6a32aaea4a50abe66147cb8ee59cb4 Mon Sep 17 00:00:00 2001 From: Andrea Spacca Date: Thu, 12 Aug 2021 11:26:50 +0200 Subject: [PATCH 04/20] make update --- filebeat/docs/modules/aws.asciidoc | 38 +++++++++++++++++++++++--- x-pack/filebeat/filebeat.reference.yml | 25 ++++++----------- 2 files changed, 42 insertions(+), 21 deletions(-) diff --git a/filebeat/docs/modules/aws.asciidoc b/filebeat/docs/modules/aws.asciidoc index bb8269933a4c..8ac1a47dfb30 100644 --- a/filebeat/docs/modules/aws.asciidoc +++ b/filebeat/docs/modules/aws.asciidoc @@ -15,9 +15,14 @@ This file is generated! See scripts/docs_collector.py beta[] This is a module for aws logs. It uses filebeat s3 input to get log files from -AWS S3 buckets with SQS notification. This module supports reading s3 server -access logs with `s3access` fileset, ELB access logs with `elb` fileset, VPC -flow logs with `vpcflow` fileset, and CloudTrail logs with `cloudtrail` fileset. +AWS S3 buckets with SQS notification or directly polling list of S3 objects in an S3 bucket. +The use of SQS notification is preferred: polling list of S3 objects is expensive +in terms of performance and costs, and cannot scale horizontally without ingestion duplication, +and should be preferably used only when no SQS notification can be attached to the S3 buckets. + +This module supports reading S3 server access logs with `s3access` fileset, +ELB access logs with `elb` fileset, VPC flow logs with `vpcflow` fileset, +and CloudTrail logs with `cloudtrail` fileset. Access logs contain detailed information about the requests made to these services. 
VPC flow logs captures information about the IP traffic going to and @@ -44,6 +49,8 @@ Example config: cloudtrail: enabled: false #var.queue_url: https://sqs.myregion.amazonaws.com/123456/myqueue + #var.s3_bucket: 'arn:aws:s3:::mybucket + #var.s3_bucket_poll_interval: 300s #var.shared_credential_file: /etc/filebeat/aws_credentials #var.credential_profile_name: fb-aws #var.access_key_id: access_key_id @@ -58,6 +65,8 @@ Example config: cloudwatch: enabled: false #var.queue_url: https://sqs.myregion.amazonaws.com/123456/myqueue + #var.s3_bucket: 'arn:aws:s3:::mybucket + #var.s3_bucket_poll_interval: 300s #var.shared_credential_file: /etc/filebeat/aws_credentials #var.credential_profile_name: fb-aws #var.access_key_id: access_key_id @@ -72,6 +81,8 @@ Example config: ec2: enabled: false #var.queue_url: https://sqs.myregion.amazonaws.com/123456/myqueue + #var.s3_bucket: 'arn:aws:s3:::mybucket + #var.s3_bucket_poll_interval: 300s #var.shared_credential_file: /etc/filebeat/aws_credentials #var.credential_profile_name: fb-aws #var.access_key_id: access_key_id @@ -86,6 +97,8 @@ Example config: elb: enabled: false #var.queue_url: https://sqs.myregion.amazonaws.com/123456/myqueue + #var.s3_bucket: 'arn:aws:s3:::mybucket + #var.s3_bucket_poll_interval: 300s #var.shared_credential_file: /etc/filebeat/aws_credentials #var.credential_profile_name: fb-aws #var.access_key_id: access_key_id @@ -100,6 +113,8 @@ Example config: s3access: enabled: false #var.queue_url: https://sqs.myregion.amazonaws.com/123456/myqueue + #var.s3_bucket: 'arn:aws:s3:::mybucket + #var.s3_bucket_poll_interval: 300s #var.shared_credential_file: /etc/filebeat/aws_credentials #var.credential_profile_name: fb-aws #var.access_key_id: access_key_id @@ -114,6 +129,8 @@ Example config: vpcflow: enabled: false #var.queue_url: https://sqs.myregion.amazonaws.com/123456/myqueue + #var.s3_bucket: 'arn:aws:s3:::mybucket + #var.s3_bucket_poll_interval: 300s #var.shared_credential_file: /etc/filebeat/aws_credentials #var.credential_profile_name: fb-aws #var.access_key_id: access_key_id @@ -128,7 +145,7 @@ Example config: *`var.queue_url`*:: -(Required) AWS SQS queue url. +AWS SQS queue url (Required when `var.s3_bucket` is not set). *`var.visibility_timeout`*:: @@ -139,6 +156,19 @@ Default to be 300 seconds. Maximum duration before AWS API request will be interrupted. Default to be 120 seconds. +*`var.s3_bucket`*:: + +AWS S3 bucket ARN (Required when `var.queue_url` is not set). + +*`var.s3_bucket_number_of_workers`*:: + +Number of workers that will process the S3 objects listed (Required when `var.s3_bucket` is set). +Use to vertically scale the input. + +*`var.s3_bucket_poll_interval`*:: + +Interval between list requests to the S3 bucket. Default to be 120 seconds. + *`var.endpoint`*:: Custom endpoint used to access AWS APIs. 
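For illustration only, the new polling variables documented above could be combined in a fileset configuration along these lines; the bucket ARN is the same placeholder used in the examples above, and the worker count and poll interval are arbitrary example values rather than recommendations:

["source","yaml"]
----
- module: aws
  cloudtrail:
    enabled: true
    # Bucket-list polling mode: set var.s3_bucket instead of var.queue_url
    # (the two settings are mutually exclusive).
    var.s3_bucket: 'arn:aws:s3:::mybucket'
    var.s3_bucket_number_of_workers: 5
    var.s3_bucket_poll_interval: 300s
----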
diff --git a/x-pack/filebeat/filebeat.reference.yml b/x-pack/filebeat/filebeat.reference.yml index 66a3bc1993a8..a0e990f74fe2 100644 --- a/x-pack/filebeat/filebeat.reference.yml +++ b/x-pack/filebeat/filebeat.reference.yml @@ -102,15 +102,6 @@ filebeat.modules: # AWS SQS queue url #var.queue_url: https://sqs.myregion.amazonaws.com/123456/myqueue - # AWS S3 bucket - #var.s3_bucket: 'arn:aws:s3:::mybucket - - # AWS S3 bucket poll interval - #var.s3_bucket_poll_interval: 300s - - # AWS S3 bucket number of workers - #var.s3_bucket_number_of_workers: 5 - # Process CloudTrail logs # default is true, set to false to skip Cloudtrail logs # var.process_cloudtrail_logs: false @@ -760,7 +751,7 @@ filebeat.modules: #------------------------------- Coredns Module ------------------------------- - module: coredns # Fileset for native deployment - log: + log: enabled: true # Set custom paths for the log files. If left empty, @@ -769,7 +760,7 @@ filebeat.modules: #----------------------------- Crowdstrike Module ----------------------------- - module: crowdstrike - + falcon: enabled: true @@ -884,7 +875,7 @@ filebeat.modules: #------------------------------ Envoyproxy Module ------------------------------ - module: envoyproxy # Fileset for native deployment - log: + log: enabled: true # Set custom paths for the log files. If left empty, @@ -1522,7 +1513,7 @@ filebeat.modules: # Oauth Token URL, should include the tenant ID #var.oauth2.token_url: "https://login.microsoftonline.com/TENANT-ID/oauth2/v2.0/token" - + # Related scopes, default should be included #var.oauth2.scopes: # - "https://api.security.microsoft.com/.default" @@ -2798,7 +2789,7 @@ filebeat.inputs: #rotation.external.strategy.copytruncate: # Regex that matches the rotated files. # suffix_regex: \.\d$ - # If the rotated filename suffix is a datetime, set it here. + # If the rotated filename suffix is a datetime, set it here. # dateformat: -20060102 ### State options @@ -3314,14 +3305,14 @@ filebeat.inputs: # Maximum duration after which events are available to the outputs, # if the number of events stored in the queue is < `flush.min_events`. #flush.timeout: 1s - + # The disk queue stores incoming events on disk until the output is # ready for them. This allows a higher event limit than the memory-only # queue and lets pending events persist through a restart. #disk: # The directory path to store the queue's data. #path: "${path.data}/diskqueue" - + # The maximum space the queue should occupy on disk. Depending on # input settings, events that exceed this limit are delayed or discarded. #max_size: 10GB @@ -4257,7 +4248,7 @@ output.elasticsearch: # Permissions to use for file creation. The default is 0600. #permissions: 0600 - + # Configure automatic file rotation on every startup. The default is true. 
#rotate_on_startup: true # ------------------------------- Console Output ------------------------------- From 84677ab590fa83a6273e4e4ab5911dc4dae29613 Mon Sep 17 00:00:00 2001 From: Andrea Spacca Date: Thu, 12 Aug 2021 11:47:06 +0200 Subject: [PATCH 05/20] aws credentials and config --- x-pack/filebeat/input/awss3/config.go | 62 ++++++--- x-pack/filebeat/input/awss3/config_test.go | 145 ++++++++++++++++++--- x-pack/libbeat/common/aws/credentials.go | 12 +- 3 files changed, 180 insertions(+), 39 deletions(-) diff --git a/x-pack/filebeat/input/awss3/config.go b/x-pack/filebeat/input/awss3/config.go index d780f8eec895..566a79426006 100644 --- a/x-pack/filebeat/input/awss3/config.go +++ b/x-pack/filebeat/input/awss3/config.go @@ -19,48 +19,70 @@ import ( ) type config struct { - APITimeout time.Duration `config:"api_timeout"` - VisibilityTimeout time.Duration `config:"visibility_timeout"` - SQSWaitTime time.Duration `config:"sqs.wait_time"` // The max duration for which the SQS ReceiveMessage call waits for a message to arrive in the queue before returning. - SQSMaxReceiveCount int `config:"sqs.max_receive_count"` // The max number of times a message should be received (retried) before deleting it. - FIPSEnabled bool `config:"fips_enabled"` - MaxNumberOfMessages int `config:"max_number_of_messages"` - QueueURL string `config:"queue_url" validate:"required"` - AWSConfig awscommon.ConfigAWS `config:",inline"` - FileSelectors []fileSelectorConfig `config:"file_selectors"` - ReaderConfig readerConfig `config:",inline"` // Reader options to apply when no file_selectors are used. + APITimeout time.Duration `config:"api_timeout"` + VisibilityTimeout time.Duration `config:"visibility_timeout"` + SQSWaitTime time.Duration `config:"sqs.wait_time"` // The max duration for which the SQS ReceiveMessage call waits for a message to arrive in the queue before returning. + SQSMaxReceiveCount int `config:"sqs.max_receive_count"` // The max number of times a message should be received (retried) before deleting it. + FIPSEnabled bool `config:"fips_enabled"` + MaxNumberOfMessages int `config:"max_number_of_messages"` + QueueURL string `config:"queue_url"` + S3Bucket string `config:"s3_bucket"` + S3BucketPollInterval time.Duration `config:"s3_bucket_poll_interval"` + S3BucketNumberOfWorkers int `config:"s3_bucket_number_of_workers"` + AWSConfig awscommon.ConfigAWS `config:",inline"` + FileSelectors []fileSelectorConfig `config:"file_selectors"` + ReaderConfig readerConfig `config:",inline"` // Reader options to apply when no file_selectors are used. 
} func defaultConfig() config { c := config{ - APITimeout: 120 * time.Second, - VisibilityTimeout: 300 * time.Second, - SQSWaitTime: 20 * time.Second, - SQSMaxReceiveCount: 5, - FIPSEnabled: false, - MaxNumberOfMessages: 5, + APITimeout: 120 * time.Second, + VisibilityTimeout: 300 * time.Second, + S3BucketPollInterval: 120 * time.Second, + SQSWaitTime: 20 * time.Second, + SQSMaxReceiveCount: 5, + FIPSEnabled: false, + MaxNumberOfMessages: 5, } c.ReaderConfig.InitDefaults() return c } func (c *config) Validate() error { - if c.VisibilityTimeout <= 0 || c.VisibilityTimeout.Hours() > 12 { + if c.QueueURL == "" && c.S3Bucket == "" { + return fmt.Errorf("queue_url or s3_bucket must provided") + } + + if c.QueueURL != "" && c.S3Bucket != "" { + return fmt.Errorf("queue_url <%v> and s3_bucket <%v> "+ + "cannot be set at the same time", c.QueueURL, c.S3Bucket) + } + + if c.S3Bucket != "" && (c.S3BucketPollInterval <= 0 || c.S3BucketPollInterval.Hours() > 12) { + return fmt.Errorf("s3_bucket_poll_interval <%v> must be greater than 0 and "+ + "less than or equal to 12h", c.S3BucketPollInterval) + } + + if c.S3Bucket != "" && c.S3BucketNumberOfWorkers <= 0 { + return fmt.Errorf("s3_bucket_number_of_workers <%v> must be greater than 0", c.S3BucketNumberOfWorkers) + } + + if c.QueueURL != "" && (c.VisibilityTimeout <= 0 || c.VisibilityTimeout.Hours() > 12) { return fmt.Errorf("visibility_timeout <%v> must be greater than 0 and "+ "less than or equal to 12h", c.VisibilityTimeout) } - if c.SQSWaitTime <= 0 || c.SQSWaitTime.Seconds() > 20 { + if c.QueueURL != "" && (c.SQSWaitTime <= 0 || c.SQSWaitTime.Seconds() > 20) { return fmt.Errorf("wait_time <%v> must be greater than 0 and "+ "less than or equal to 20s", c.SQSWaitTime) } - if c.MaxNumberOfMessages <= 0 { + if c.QueueURL != "" && c.MaxNumberOfMessages <= 0 { return fmt.Errorf("max_number_of_messages <%v> must be greater than 0", c.MaxNumberOfMessages) } - if c.APITimeout < c.SQSWaitTime { + if c.QueueURL != "" && c.APITimeout < c.SQSWaitTime { return fmt.Errorf("api_timeout <%v> must be greater than the sqs.wait_time <%v", c.APITimeout, c.SQSWaitTime) } diff --git a/x-pack/filebeat/input/awss3/config_test.go b/x-pack/filebeat/input/awss3/config_test.go index 77e35bcb0f33..b50c71d576bd 100644 --- a/x-pack/filebeat/input/awss3/config_test.go +++ b/x-pack/filebeat/input/awss3/config_test.go @@ -8,9 +8,10 @@ import ( "testing" "time" + "github.com/stretchr/testify/require" + "github.com/dustin/go-humanize" "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" "github.com/elastic/beats/v7/libbeat/common" "github.com/elastic/beats/v7/libbeat/common/match" @@ -20,19 +21,22 @@ import ( func TestConfig(t *testing.T) { const queueURL = "https://example.com" - makeConfig := func() config { + const s3Bucket = "arn:aws:s3:::aBucket" + makeConfig := func(quequeURL, s3Bucket string) config { // Have a separate copy of defaults in the test to make it clear when // anyone changes the defaults. 
parserConf := parser.Config{} require.NoError(t, parserConf.Unpack(common.MustNewConfigFrom(""))) return config{ - QueueURL: queueURL, - APITimeout: 120 * time.Second, - VisibilityTimeout: 300 * time.Second, - SQSMaxReceiveCount: 5, - SQSWaitTime: 20 * time.Second, - FIPSEnabled: false, - MaxNumberOfMessages: 5, + QueueURL: quequeURL, + S3Bucket: s3Bucket, + APITimeout: 120 * time.Second, + VisibilityTimeout: 300 * time.Second, + SQSMaxReceiveCount: 5, + SQSWaitTime: 20 * time.Second, + S3BucketPollInterval: 120 * time.Second, + FIPSEnabled: false, + MaxNumberOfMessages: 5, ReaderConfig: readerConfig{ BufferSize: 16 * humanize.KiByte, MaxBytes: 10 * humanize.MiByte, @@ -44,20 +48,41 @@ func TestConfig(t *testing.T) { testCases := []struct { name string + queueURL string + s3Bucket string config common.MapStr expectedErr string - expectedCfg func() config + expectedCfg func(queueURL, s3Bucket string) config }{ { - "input with defaults", + "input with defaults for queueURL", + queueURL, + "", common.MapStr{ "queue_url": queueURL, }, "", makeConfig, }, + { + "input with defaults for s3Bucket", + "", + s3Bucket, + common.MapStr{ + "s3_bucket": s3Bucket, + "s3_bucket_number_of_workers": 5, + }, + "", + func(queueURL, s3Bucket string) config { + c := makeConfig("", s3Bucket) + c.S3BucketNumberOfWorkers = 5 + return c + }, + }, { "input with file_selectors", + queueURL, + "", common.MapStr{ "queue_url": queueURL, "file_selectors": []common.MapStr{ @@ -67,8 +92,8 @@ func TestConfig(t *testing.T) { }, }, "", - func() config { - c := makeConfig() + func(queueURL, s3Bucketr string) config { + c := makeConfig(queueURL, "") regex := match.MustCompile("/CloudTrail/") c.FileSelectors = []fileSelectorConfig{ { @@ -79,17 +104,43 @@ func TestConfig(t *testing.T) { return c }, }, + { + "error on no queueURL and s3Bucket", + "", + "", + common.MapStr{ + "queue_url": "", + "s3_bucket": "", + }, + "queue_url or s3_bucket must provided", + nil, + }, + { + "error on both queueURL and s3Bucket", + queueURL, + s3Bucket, + common.MapStr{ + "queue_url": queueURL, + "s3_bucket": s3Bucket, + }, + "queue_url and s3_bucket cannot be set at the same time", + nil, + }, { "error on api_timeout == 0", + queueURL, + "", common.MapStr{ "queue_url": queueURL, "api_timeout": "0", }, - "api_timeout <0s> must be greater than the sqs.wait_time", + "api_timeout <0s> must be greater than the sqs.wait_time <20s", nil, }, { "error on visibility_timeout == 0", + queueURL, + "", common.MapStr{ "queue_url": queueURL, "visibility_timeout": "0", @@ -99,6 +150,8 @@ func TestConfig(t *testing.T) { }, { "error on visibility_timeout > 12h", + queueURL, + "", common.MapStr{ "queue_url": queueURL, "visibility_timeout": "12h1ns", @@ -106,8 +159,43 @@ func TestConfig(t *testing.T) { "visibility_timeout <12h0m0.000000001s> must be greater than 0 and less than or equal to 12h", nil, }, + { + "error on s3_bucket_poll_interval == 0", + "", + s3Bucket, + common.MapStr{ + "s3_bucket": s3Bucket, + "s3_bucket_poll_interval": "0", + }, + "s3_bucket_poll_interval <0s> must be greater than 0 and less than or equal to 12h", + nil, + }, + { + "error on s3_bucket_poll_interval > 12h", + "", + s3Bucket, + common.MapStr{ + "s3_bucket": s3Bucket, + "s3_bucket_poll_interval": "12h1ns", + }, + "s3_bucket_poll_interval <12h0m0.000000001s> must be greater than 0 and less than or equal to 12h", + nil, + }, + { + "error on s3_bucket_number_of_workers == 0", + "", + s3Bucket, + common.MapStr{ + "s3_bucket": s3Bucket, + "s3_bucket_number_of_workers": "0", + }, + 
"s3_bucket_number_of_workers <0> must be greater than 0", + nil, + }, { "error on max_number_of_messages == 0", + queueURL, + "", common.MapStr{ "queue_url": queueURL, "max_number_of_messages": "0", @@ -117,6 +205,8 @@ func TestConfig(t *testing.T) { }, { "error on buffer_size == 0 ", + queueURL, + "", common.MapStr{ "queue_url": queueURL, "buffer_size": "0", @@ -124,8 +214,21 @@ func TestConfig(t *testing.T) { "buffer_size <0> must be greater than 0", nil, }, + { + "error on max_bytes == 0 ", + queueURL, + "", + common.MapStr{ + "queue_url": queueURL, + "max_bytes": "0", + }, + "max_bytes <0> must be greater than 0", + nil, + }, { "error on expand_event_list_from_field and content_type != application/json ", + queueURL, + "", common.MapStr{ "queue_url": queueURL, "expand_event_list_from_field": "Records", @@ -134,6 +237,18 @@ func TestConfig(t *testing.T) { "content_type must be `application/json` when expand_event_list_from_field is used", nil, }, + { + "error on expand_event_list_from_field and content_type != application/json ", + "", + s3Bucket, + common.MapStr{ + "s3_bucket": s3Bucket, + "expand_event_list_from_field": "Records", + "content_type": "text/plain", + }, + "content_type must be `application/json` when expand_event_list_from_field is used", + nil, + }, } for _, tc := range testCases { @@ -152,7 +267,7 @@ func TestConfig(t *testing.T) { if tc.expectedCfg == nil { t.Fatal("missing expected config in test case") } - assert.EqualValues(t, tc.expectedCfg(), c) + assert.EqualValues(t, tc.expectedCfg(tc.queueURL, tc.s3Bucket), c) }) } } diff --git a/x-pack/libbeat/common/aws/credentials.go b/x-pack/libbeat/common/aws/credentials.go index 662da9d570a6..4ca498e1aa52 100644 --- a/x-pack/libbeat/common/aws/credentials.go +++ b/x-pack/libbeat/common/aws/credentials.go @@ -76,8 +76,10 @@ func getAccessKeys(config ConfigAWS) awssdk.Config { Value: awsCredentials, } - // Set default region to make initial aws api call - awsConfig.Region = "us-east-1" + // Set default region if empty to make initial aws api call + if awsConfig.Region == "" { + awsConfig.Region = "us-east-1" + } // Assume IAM role if iam_role config parameter is given if config.RoleArn != "" { @@ -112,8 +114,10 @@ func getSharedCredentialProfile(config ConfigAWS) (awssdk.Config, error) { return awsConfig, errors.Wrap(err, "external.LoadDefaultAWSConfig failed with shared credential profile given") } - // Set default region to make initial aws api call - awsConfig.Region = "us-east-1" + // Set default region if empty to make initial aws api call + if awsConfig.Region == "" { + awsConfig.Region = "us-east-1" + } // Assume IAM role if iam_role config parameter is given if config.RoleArn != "" { From 5ef9d1216aeb5c5e2ee8a3f0892a6d1adad731cd Mon Sep 17 00:00:00 2001 From: Andrea Spacca Date: Thu, 12 Aug 2021 12:14:01 +0200 Subject: [PATCH 06/20] state and states --- x-pack/filebeat/input/awss3/state.go | 62 +++++ x-pack/filebeat/input/awss3/state_test.go | 134 +++++++++ x-pack/filebeat/input/awss3/states.go | 309 +++++++++++++++++++++ x-pack/filebeat/input/awss3/states_test.go | 144 ++++++++++ 4 files changed, 649 insertions(+) create mode 100644 x-pack/filebeat/input/awss3/state.go create mode 100644 x-pack/filebeat/input/awss3/state_test.go create mode 100644 x-pack/filebeat/input/awss3/states.go create mode 100644 x-pack/filebeat/input/awss3/states_test.go diff --git a/x-pack/filebeat/input/awss3/state.go b/x-pack/filebeat/input/awss3/state.go new file mode 100644 index 000000000000..aefd59414b1f --- /dev/null +++ 
b/x-pack/filebeat/input/awss3/state.go @@ -0,0 +1,62 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. + +package awss3 + +import ( + "fmt" + "time" +) + +// state is used to communicate the reading state of a file +type state struct { + Id string `json:"id" struct:"id"` + Bucket string `json:"bucket" struct:"bucket"` + Key string `json:"key" struct:"key"` + Etag string `json:"etag" struct:"etag"` + LastModified time.Time `json:"last_modified" struct:"last_modifed"` + Stored bool `json:"stored" struct:"stored"` +} + +// newState creates a new s3 object state +func newState(bucket, key, etag string, lastModified time.Time) state { + s := state{ + Bucket: bucket, + Key: key, + LastModified: lastModified, + Etag: etag, + Stored: false, + } + + s.Id = s.Bucket + s.Key + s.Etag + s.LastModified.String() + + return s +} + +// MarkAsStored set the stored flag to true +func (s *state) MarkAsStored() { + s.Stored = true +} + +// IsEqual checks if the two states point to the same file. +func (s *state) IsEqual(c *state) bool { + return s.Bucket == c.Bucket && s.Key == c.Key && s.Etag == c.Etag && s.LastModified.Equal(c.LastModified) +} + +// IsEmpty checks if the state is empty +func (s *state) IsEmpty() bool { + c := state{} + return s.Bucket == c.Bucket && s.Key == c.Key && s.Etag == c.Etag && s.LastModified.Equal(c.LastModified) +} + +// String returns string representation of the struct +func (s *state) String() string { + return fmt.Sprintf( + "{Id: %v, Bucket: %v, Key: %v, Etag: %v, LastModified: %v}", + s.Id, + s.Bucket, + s.Key, + s.Etag, + s.LastModified) +} diff --git a/x-pack/filebeat/input/awss3/state_test.go b/x-pack/filebeat/input/awss3/state_test.go new file mode 100644 index 000000000000..07db57329673 --- /dev/null +++ b/x-pack/filebeat/input/awss3/state_test.go @@ -0,0 +1,134 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. 
+ +package awss3 + +import ( + "testing" + "time" + + "github.com/stretchr/testify/assert" +) + +func TestStateIsEqual(t *testing.T) { + type stateTestCase struct { + states [2]state + isSame bool + } + + lastModifed := time.Now() + tests := map[string]stateTestCase{ + "two states pointing to the same key with same etag and same last modified not stored": { + [2]state{ + state{ + Bucket: "bucket a", + Key: "/key/to/this/file/1", + Etag: "etag", + LastModified: lastModifed, + }, + state{ + Bucket: "bucket a", + Key: "/key/to/this/file/1", + Etag: "etag", + LastModified: lastModifed, + }, + }, + true, + }, + "two states pointing to the same key with same etag and same last modified stored": { + [2]state{ + state{ + Bucket: "bucket a", + Key: "/key/to/this/file/1", + Etag: "etag", + LastModified: lastModifed, + Stored: true, + }, + state{ + Bucket: "bucket a", + Key: "/key/to/this/file/1", + Etag: "etag", + LastModified: lastModifed, + }, + }, + true, + }, + "two states pointing to the same key with different etag and same last modified": { + [2]state{ + state{ + Bucket: "bucket a", + Key: "/key/to/this/file/1", + Etag: "etag1", + LastModified: lastModifed, + }, + state{ + Bucket: "bucket a", + Key: "/key/to/this/file/1", + Etag: "etag2", + LastModified: lastModifed, + }, + }, + false, + }, + "two states pointing to the same key with same etag and different last modified": { + [2]state{ + state{ + Bucket: "bucket a", + Key: "/key/to/this/file/1", + Etag: "etag", + LastModified: time.Now(), + }, + state{ + Bucket: "bucket a", + Key: "/key/to/this/file/1", + Etag: "etag", + LastModified: time.Now().Add(10 * time.Second), + }, + }, + false, + }, + "two states pointing to different key": { + [2]state{ + state{ + Bucket: "bucket a", + Key: "/key/to/this/file/1", + Etag: "etag", + LastModified: lastModifed, + }, + state{ + Bucket: "bucket a", + Key: "/key/to/this/file/2", + Etag: "etag", + LastModified: lastModifed, + }, + }, + false, + }, + "two states pointing to different bucket": { + [2]state{ + state{ + Bucket: "bucket b", + Key: "/key/to/this/file/1", + Etag: "etag", + LastModified: lastModifed, + }, + state{ + Bucket: "bucket a", + Key: "/key/to/this/file/1", + Etag: "etag", + LastModified: lastModifed, + }, + }, + false, + }, + } + + for name, test := range tests { + test := test + t.Run(name, func(t *testing.T) { + isSame := test.states[0].IsEqual(&test.states[1]) + assert.Equal(t, test.isSame, isSame) + }) + } +} diff --git a/x-pack/filebeat/input/awss3/states.go b/x-pack/filebeat/input/awss3/states.go new file mode 100644 index 000000000000..de7abacf996b --- /dev/null +++ b/x-pack/filebeat/input/awss3/states.go @@ -0,0 +1,309 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. + +package awss3 + +import ( + "strings" + "sync" + + "github.com/elastic/beats/v7/libbeat/statestore" + + "github.com/elastic/beats/v7/libbeat/logp" +) + +const ( + awsS3ObjectStatePrefix = "filebeat::aws-s3::state::" + awsS3WriteCommitPrefix = "filebeat::aws-s3::writeCommit::" +) + +// storedOp keeps track of pending updates that are not written to the persistent store yet. +// Update operations are ordered. 
The input manager guarantees that only one +// input can create update operation for a source, such that new input +// instances can add update operations to be executed after already pending +// update operations from older inputs instances that have been shutdown. +type storedOp struct { + states *states + store *statestore.Store + lockMap *sync.Map +} + +type listingInfo struct { + totObjects int + storedObjects int +} + +// states handles list of s3 object state. One must use newStates to instantiate a +// file states registry. Using the zero-value is not safe. +type states struct { + sync.RWMutex + + // states store + states []state + + // idx maps state IDs to state indexes for fast lookup and modifications. + idx map[string]int + + listingIDs map[string]struct{} + listingInfo *sync.Map + statesByListingID map[string][]state +} + +// newStates generates a new states registry. +func newStates() *states { + return &states{ + states: nil, + idx: map[string]int{}, + listingInfo: new(sync.Map), + listingIDs: map[string]struct{}{}, + statesByListingID: map[string][]state{}, + } +} + +func (s *states) Delete(id string) { + s.Lock() + defer s.Unlock() + + index := s.findPrevious(id) + if index >= 0 { + last := len(s.states) - 1 + s.states[last], s.states[index] = s.states[index], s.states[last] + s.states = s.states[:last] + } + + s.idx = map[string]int{} + for i, state := range s.states { + s.idx[state.Id] = i + } +} + +// IsListingFullyStored check if listing if fully stored +func (s *states) IsListingFullyStored(listingID string) bool { + info, _ := s.listingInfo.Load(listingID) + return info.(listingInfo).storedObjects == info.(listingInfo).totObjects +} + +// AddListing add listing info +func (s *states) AddListing(listingID string, listingInfo listingInfo) { + s.Lock() + defer s.Unlock() + s.listingIDs[listingID] = struct{}{} + s.listingInfo.Store(listingID, listingInfo) +} + +// DeleteListing delete listing info +func (s *states) DeleteListing(listingID string) { + s.Lock() + defer s.Unlock() + delete(s.listingIDs, listingID) + delete(s.statesByListingID, listingID) + s.listingInfo.Delete(listingID) +} + +// Update updates a state. If previous state didn't exist, new one is created +func (s *states) Update(newState state, listingID string) { + s.Lock() + defer s.Unlock() + + id := newState.Bucket + newState.Key + index := s.findPrevious(id) + + if index >= 0 { + s.states[index] = newState + } else { + // No existing state found, add new one + s.idx[id] = len(s.states) + s.states = append(s.states, newState) + logp.Debug("input", "New state added for %s", newState.Key) + } + + if listingID == "" || !newState.Stored { + return + } + + // listing map is shared with the collector + // here we increase the number of stored object + info, _ := s.listingInfo.Load(listingID) + listingInfo := info.(listingInfo) + listingInfo.storedObjects++ + s.listingInfo.Store(listingID, listingInfo) + + if _, ok := s.statesByListingID[listingID]; !ok { + s.statesByListingID[listingID] = make([]state, 0) + } + + s.statesByListingID[listingID] = append(s.statesByListingID[listingID], newState) +} + +// FindPrevious lookups a registered state, that matching the new state. +// Returns a zero-state if no match is found. +func (s *states) FindPrevious(newState state) state { + s.RLock() + defer s.RUnlock() + id := newState.Bucket + newState.Key + i := s.findPrevious(id) + if i < 0 { + return state{} + } + return s.states[i] +} + +// FindPreviousByID lookups a registered state, that matching the id. 
+// Returns a zero-state if no match is found. +func (s *states) FindPreviousByID(id string) state { + s.RLock() + defer s.RUnlock() + i := s.findPrevious(id) + if i < 0 { + return state{} + } + return s.states[i] +} + +func (s *states) IsNew(state state) bool { + s.RLock() + defer s.RUnlock() + id := state.Bucket + state.Key + i := s.findPrevious(id) + + if i < 0 { + return true + } + + return !s.states[i].IsEqual(&state) +} + +// findPrevious returns the previous state for the file. +// In case no previous state exists, index -1 is returned +func (s *states) findPrevious(id string) int { + if i, exists := s.idx[id]; exists { + return i + } + return -1 +} + +// GetStates creates copy of the file states. +func (s *states) GetStates() []state { + s.RLock() + defer s.RUnlock() + + newStates := make([]state, len(s.states)) + copy(newStates, s.states) + + return newStates +} + +// GetListingIDs return a of the listing IDs +func (s *states) GetListingIDs() []string { + s.RLock() + defer s.RUnlock() + listingIDs := make([]string, 0, len(s.listingIDs)) + for listingID := range s.listingIDs { + listingIDs = append(listingIDs, listingID) + } + + return listingIDs +} + +// GetStatesByListingID return a copy of the states by listing ID +func (s *states) GetStatesByListingID(listingID string) []state { + s.RLock() + defer s.RUnlock() + + if _, ok := s.statesByListingID[listingID]; !ok { + return nil + } + + newStates := make([]state, len(s.statesByListingID[listingID])) + copy(newStates, s.statesByListingID[listingID]) + return newStates +} + +func (s *states) readStatesFrom(store *statestore.Store) error { + var states []state + + err := store.Each(func(key string, dec statestore.ValueDecoder) (bool, error) { + if !strings.HasPrefix(key, awsS3ObjectStatePrefix) { + return true, nil + } + + // try to decode. Ingore faulty/incompatible values. + var st state + if err := dec.Decode(&st); err != nil { + // XXX: Do we want to log here? In case we start to store other + // state types in the registry, then this operation will likely fail + // quite often, producing some false-positives in the logs... + return true, nil + } + + st.Id = key[len(awsS3ObjectStatePrefix):] + states = append(states, st) + return true, nil + }) + + if err != nil { + return err + } + + states = fixStates(states) + + for _, state := range states { + s.Update(state, "") + } + + return nil +} + +// fixStates cleans up the registry states when updating from an older version +// of filebeat potentially writing invalid entries. +func fixStates(states []state) []state { + if len(states) == 0 { + return states + } + + // we use a map of states here, so to identify and merge duplicate entries. + idx := map[string]*state{} + for i := range states { + state := &states[i] + + old, exists := idx[state.Id] + if !exists { + idx[state.Id] = state + } else { + mergeStates(old, state) // overwrite the entry in 'old' + } + } + + if len(idx) == len(states) { + return states + } + + i := 0 + newStates := make([]state, len(idx)) + for _, state := range idx { + newStates[i] = *state + i++ + } + return newStates +} + +// mergeStates merges 2 states by trying to determine the 'newer' state. +// The st state is overwritten with the updated fields. +func mergeStates(st, other *state) { + // update file meta-data. As these are updated concurrently by the + // inputs, select the newer state based on the update timestamp. 
+ if st.LastModified.Before(other.LastModified) { + st.LastModified = other.LastModified + } +} + +func (s *states) writeStates(store *statestore.Store) error { + for _, state := range s.GetStates() { + key := awsS3ObjectStatePrefix + state.Id + if err := store.Set(key, state); err != nil { + return err + } + } + return nil +} diff --git a/x-pack/filebeat/input/awss3/states_test.go b/x-pack/filebeat/input/awss3/states_test.go new file mode 100644 index 000000000000..952bc98e9b76 --- /dev/null +++ b/x-pack/filebeat/input/awss3/states_test.go @@ -0,0 +1,144 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. + +package awss3 + +import ( + "testing" + "time" + + "github.com/stretchr/testify/assert" +) + +func TestStatesDelete(t *testing.T) { + type stateTestCase struct { + states func() *states + deleteID string + expected []state + } + + lastModified := time.Date(2021, time.July, 22, 18, 38, 00, 0, time.UTC) + tests := map[string]stateTestCase{ + "delete empty states": { + states: func() *states { + return newStates() + }, + deleteID: "an id", + expected: []state{}, + }, + "delete not existing state": { + states: func() *states { + states := newStates() + states.Update(newState("bucket", "key", "etag", lastModified), "") + return states + }, + deleteID: "an id", + expected: []state{ + { + Id: "bucketkeyetag" + lastModified.String(), + Bucket: "bucket", + Key: "key", + Etag: "etag", + LastModified: lastModified, + }, + }, + }, + "delete only one existing": { + states: func() *states { + states := newStates() + states.Update(newState("bucket", "key", "etag", lastModified), "") + return states + }, + deleteID: "bucketkey", + expected: []state{}, + }, + "delete first": { + states: func() *states { + states := newStates() + states.Update(newState("bucket", "key1", "etag1", lastModified), "") + states.Update(newState("bucket", "key2", "etag2", lastModified), "") + states.Update(newState("bucket", "key3", "etag3", lastModified), "") + return states + }, + deleteID: "bucketkey1", + expected: []state{ + { + Id: "bucketkey3etag3" + lastModified.String(), + Bucket: "bucket", + Key: "key3", + Etag: "etag3", + LastModified: lastModified, + }, + { + Id: "bucketkey2etag2" + lastModified.String(), + Bucket: "bucket", + Key: "key2", + Etag: "etag2", + LastModified: lastModified, + }, + }, + }, + "delete last": { + states: func() *states { + states := newStates() + states.Update(newState("bucket", "key1", "etag1", lastModified), "") + states.Update(newState("bucket", "key2", "etag2", lastModified), "") + states.Update(newState("bucket", "key3", "etag3", lastModified), "") + return states + }, + deleteID: "bucketkey3", + expected: []state{ + { + Id: "bucketkey1etag1" + lastModified.String(), + Bucket: "bucket", + Key: "key1", + Etag: "etag1", + LastModified: lastModified, + }, + { + Id: "bucketkey2etag2" + lastModified.String(), + Bucket: "bucket", + Key: "key2", + Etag: "etag2", + LastModified: lastModified, + }, + }, + }, + "delete any": { + states: func() *states { + states := newStates() + states.Update(newState("bucket", "key1", "etag1", lastModified), "") + states.Update(newState("bucket", "key2", "etag2", lastModified), "") + states.Update(newState("bucket", "key3", "etag3", lastModified), "") + return states + }, + deleteID: "bucketkey2", + expected: []state{ + { + Id: "bucketkey1etag1" + lastModified.String(), + 
Bucket: "bucket", + Key: "key1", + Etag: "etag1", + LastModified: lastModified, + }, + { + Id: "bucketkey3etag3" + lastModified.String(), + Bucket: "bucket", + Key: "key3", + Etag: "etag3", + LastModified: lastModified, + }, + }, + }, + } + + for name, test := range tests { + test := test + t.Run(name, func(t *testing.T) { + states := test.states() + states.Delete(test.deleteID) + assert.Equal(t, test.expected, states.GetStates()) + }) + } +} From e4e595044ff333b18b29d7bb2b03fff04bb9afe7 Mon Sep 17 00:00:00 2001 From: Andrea Spacca Date: Thu, 12 Aug 2021 18:56:53 +0200 Subject: [PATCH 07/20] s3 bucket polling refactored --- x-pack/filebeat/input/awss3/input.go | 122 +++- x-pack/filebeat/input/awss3/interfaces.go | 5 + .../input/awss3/mock_interfaces_test.go | 19 +- .../input/awss3/mock_publisher_test.go | 7 +- x-pack/filebeat/input/awss3/s3.go | 611 +++++++----------- x-pack/filebeat/input/awss3/s3_objects.go | 453 +++++++++++++ .../filebeat/input/awss3/s3_objects_test.go | 261 ++++++++ x-pack/filebeat/input/awss3/s3_test.go | 441 ++++++------- x-pack/filebeat/input/awss3/state.go | 8 +- x-pack/filebeat/input/awss3/states.go | 43 +- .../filebeat/input/default-inputs/inputs.go | 2 +- 11 files changed, 1348 insertions(+), 624 deletions(-) create mode 100644 x-pack/filebeat/input/awss3/s3_objects.go create mode 100644 x-pack/filebeat/input/awss3/s3_objects_test.go diff --git a/x-pack/filebeat/input/awss3/input.go b/x-pack/filebeat/input/awss3/input.go index 82c01778c137..cd8a54d744e4 100644 --- a/x-pack/filebeat/input/awss3/input.go +++ b/x-pack/filebeat/input/awss3/input.go @@ -10,6 +10,10 @@ import ( "net/url" "strings" + "github.com/elastic/go-concert/unison" + + "github.com/elastic/beats/v7/filebeat/beater" + awssdk "github.com/aws/aws-sdk-go-v2/aws" "github.com/aws/aws-sdk-go-v2/service/s3" "github.com/aws/aws-sdk-go-v2/service/sqs" @@ -19,51 +23,57 @@ import ( "github.com/elastic/beats/v7/libbeat/common" "github.com/elastic/beats/v7/libbeat/feature" "github.com/elastic/beats/v7/libbeat/monitoring" + "github.com/elastic/beats/v7/libbeat/statestore" awscommon "github.com/elastic/beats/v7/x-pack/libbeat/common/aws" ) const inputName = "aws-s3" -func Plugin() v2.Plugin { +func Plugin(store beater.StateStore) v2.Plugin { return v2.Plugin{ Name: inputName, Stability: feature.Stable, Deprecated: false, Info: "Collect logs from s3", - Manager: v2.ConfigureWith(configure), + Manager: &s3InputManager{store: store}, } } -func configure(cfg *common.Config) (v2.Input, error) { +type s3InputManager struct { + s3Input *s3Input + store beater.StateStore +} + +func (im *s3InputManager) Init(grp unison.Group, mode v2.Mode) error { + return nil +} + +func (im *s3InputManager) Create(cfg *common.Config) (v2.Input, error) { config := defaultConfig() if err := cfg.Unpack(&config); err != nil { return nil, err } - return newInput(config) + return newInput(config, im.store) } // s3Input is a input for reading logs from S3 when triggered by an SQS message. 
type s3Input struct { config config awsConfig awssdk.Config + store beater.StateStore } -func newInput(config config) (*s3Input, error) { +func newInput(config config, store beater.StateStore) (*s3Input, error) { awsConfig, err := awscommon.InitializeAWSConfig(config.AWSConfig) if err != nil { return nil, fmt.Errorf("failed to initialize AWS credentials: %w", err) } - regionName, err := getRegionFromQueueURL(config.QueueURL, config.AWSConfig.Endpoint) - if err != nil { - return nil, fmt.Errorf("failed to get AWS region from queue_url: %w", err) - } - awsConfig.Region = regionName - return &s3Input{ config: config, awsConfig: awsConfig, + store: store, }, nil } @@ -74,6 +84,23 @@ func (in *s3Input) Test(ctx v2.TestContext) error { } func (in *s3Input) Run(inputContext v2.Context, pipeline beat.Pipeline) error { + var err error + + persistentStore, err := in.store.Access() + if err != nil { + return fmt.Errorf("Can not access persistent store: %w", err) + } + + defer func() { + persistentStore.Close() + }() + + states := newStates() + err = states.readStatesFrom(persistentStore) + if err != nil { + return fmt.Errorf("Can not start persistent store: %w", err) + } + // Wrap input Context's cancellation Done channel a context.Context. This // goroutine stops with the parent closes the Done channel. ctx, cancelInputCtx := context.WithCancel(context.Background()) @@ -96,15 +123,37 @@ func (in *s3Input) Run(inputContext v2.Context, pipeline beat.Pipeline) error { } defer client.Close() - // Create SQS receiver and S3 notification processor. - receiver, err := in.createSQSReceiver(inputContext, client) - if err != nil { - return fmt.Errorf("failed to initialize sqs receiver: %w", err) + if in.config.QueueURL != "" { + regionName, err := getRegionFromQueueURL(in.config.QueueURL, in.config.AWSConfig.Endpoint) + if err != nil { + return fmt.Errorf("failed to get AWS region from queue_url: %w", err) + } + + in.awsConfig.Region = regionName + + // Create SQS receiver and S3 notification processor. + receiver, err := in.createSQSReceiver(inputContext, client) + if err != nil { + return fmt.Errorf("failed to initialize sqs receiver: %w", err) + } + defer receiver.metrics.Close() + + if err := receiver.Receive(ctx); err != nil { + return err + } } - defer receiver.metrics.Close() - if err := receiver.Receive(ctx); err != nil { - return err + if in.config.S3Bucket != "" { + // Create S3 receiver and S3 notification processor. 
+ poller, err := in.createS3Lister(inputContext, client, persistentStore, states)
+ if err != nil {
+ return fmt.Errorf("failed to initialize s3 poller: %w", err)
+ }
+ defer poller.metrics.Close()
+
+ if err := poller.Poll(ctx); err != nil {
+ return err
+ }
 }
 
 return nil
@@ -149,6 +198,43 @@ func (in *s3Input) createSQSReceiver(ctx v2.Context, client beat.Client) (*sqsRe
 return sqsReader, nil
 }
 
+func (in *s3Input) createS3Lister(ctx v2.Context, client beat.Client, persistentStore *statestore.Store, states *states) (*s3Poller, error) {
+ s3ServiceName := "s3"
+ if in.config.FIPSEnabled {
+ s3ServiceName = "s3-fips"
+ }
+
+ s3API := &awsS3API{
+ client: s3.New(awscommon.EnrichAWSConfigWithEndpoint(in.config.AWSConfig.Endpoint, s3ServiceName, in.awsConfig.Region, in.awsConfig)),
+ }
+
+ log := ctx.Logger.With("s3_bucket", in.config.S3Bucket)
+ log.Infof("s3_bucket_number_of_workers is set to %v.", in.config.S3BucketNumberOfWorkers)
+ log.Infof("s3_bucket_poll_interval is set to %v.", in.config.S3BucketPollInterval)
+ log.Infof("AWS region is set to %v.", in.awsConfig.Region)
+ log.Debugf("AWS S3 service name is %v.", s3ServiceName)
+
+ metricRegistry := monitoring.GetNamespace("dataset").GetRegistry()
+ metrics := newInputMetrics(metricRegistry, ctx.ID)
+
+ fileSelectors := in.config.FileSelectors
+ if len(in.config.FileSelectors) == 0 {
+ fileSelectors = []fileSelectorConfig{{ReaderConfig: in.config.ReaderConfig}}
+ }
+ s3EventHandlerFactory := newS3ObjectProcessorFactory(log.Named("s3"), metrics, s3API, client, fileSelectors)
+ s3Poller := newS3Poller(log.Named("s3_poller"),
+ metrics,
+ s3API,
+ s3EventHandlerFactory,
+ states,
+ persistentStore,
+ in.config.S3Bucket,
+ in.config.S3BucketNumberOfWorkers,
+ in.config.S3BucketPollInterval)
+
+ return s3Poller, nil
+}
+
 func getRegionFromQueueURL(queueURL string, endpoint string) (string, error) {
 // get region from queueURL
 // Example: https://sqs.us-east-1.amazonaws.com/627959692251/test-s3-logs
diff --git a/x-pack/filebeat/input/awss3/interfaces.go b/x-pack/filebeat/input/awss3/interfaces.go
index c03f94f10897..2e4067173487 100644
--- a/x-pack/filebeat/input/awss3/interfaces.go
+++ b/x-pack/filebeat/input/awss3/interfaces.go
@@ -89,6 +89,11 @@ type s3ObjectHandler interface {
 // the publisher before returning (use eventACKTracker's Wait() method to
 // determine this).
 ProcessS3Object() error
+
+ // Wait waits for every event published by ProcessS3Object() to be ACKed
+ // by the publisher before returning. Internally it uses the
+ // eventACKTracker's Wait() method.
+ Wait()
 }
 
 // ------
diff --git a/x-pack/filebeat/input/awss3/mock_interfaces_test.go b/x-pack/filebeat/input/awss3/mock_interfaces_test.go
index 4ee00850aa5e..d05381786aab 100644
--- a/x-pack/filebeat/input/awss3/mock_interfaces_test.go
+++ b/x-pack/filebeat/input/awss3/mock_interfaces_test.go
@@ -1,7 +1,3 @@
-// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
-// or more contributor license agreements. Licensed under the Elastic License;
-// you may not use this file except in compliance with the Elastic License.
-
 // Code generated by MockGen. DO NOT EDIT.
 // Source: interfaces.go
 
@@ -15,9 +11,8 @@ import (
 s3 "github.com/aws/aws-sdk-go-v2/service/s3"
 sqs "github.com/aws/aws-sdk-go-v2/service/sqs"
 
- gomock "github.com/golang/mock/gomock"
-
 logp "github.com/elastic/beats/v7/libbeat/logp"
+ gomock "github.com/golang/mock/gomock"
 )
 
 // MockSQSAPI is a mock of sqsAPI interface.
@@ -500,3 +495,15 @@ func (mr *MockS3ObjectHandlerMockRecorder) ProcessS3Object() *gomock.Call { mr.mock.ctrl.T.Helper() return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ProcessS3Object", reflect.TypeOf((*MockS3ObjectHandler)(nil).ProcessS3Object)) } + +// Wait mocks base method. +func (m *MockS3ObjectHandler) Wait() { + m.ctrl.T.Helper() + m.ctrl.Call(m, "Wait") +} + +// Wait indicates an expected call of Wait. +func (mr *MockS3ObjectHandlerMockRecorder) Wait() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Wait", reflect.TypeOf((*MockS3ObjectHandler)(nil).Wait)) +} diff --git a/x-pack/filebeat/input/awss3/mock_publisher_test.go b/x-pack/filebeat/input/awss3/mock_publisher_test.go index 7fa935496aad..40c46062a38d 100644 --- a/x-pack/filebeat/input/awss3/mock_publisher_test.go +++ b/x-pack/filebeat/input/awss3/mock_publisher_test.go @@ -1,7 +1,3 @@ -// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one -// or more contributor license agreements. Licensed under the Elastic License; -// you may not use this file except in compliance with the Elastic License. - // Code generated by MockGen. DO NOT EDIT. // Source: github.com/elastic/beats/v7/libbeat/beat (interfaces: Client) @@ -11,9 +7,8 @@ package awss3 import ( reflect "reflect" - gomock "github.com/golang/mock/gomock" - beat "github.com/elastic/beats/v7/libbeat/beat" + gomock "github.com/golang/mock/gomock" ) // MockBeatClient is a mock of Client interface. diff --git a/x-pack/filebeat/input/awss3/s3.go b/x-pack/filebeat/input/awss3/s3.go index 8697b6589bd0..c67266d61ef9 100644 --- a/x-pack/filebeat/input/awss3/s3.go +++ b/x-pack/filebeat/input/awss3/s3.go @@ -5,445 +5,332 @@ package awss3 import ( - "bufio" - "bytes" - "compress/gzip" "context" - "crypto/sha256" - "encoding/hex" - "encoding/json" - "fmt" - "io" - "io/ioutil" - "net/http" - "reflect" + "net/url" "strings" + "sync" "time" - "github.com/aws/aws-sdk-go-v2/service/s3" + "github.com/elastic/beats/v7/libbeat/statestore" + + "github.com/gofrs/uuid" "github.com/pkg/errors" + "go.uber.org/multierr" - "github.com/elastic/beats/v7/libbeat/beat" - "github.com/elastic/beats/v7/libbeat/common" "github.com/elastic/beats/v7/libbeat/logp" "github.com/elastic/beats/v7/libbeat/monitoring" - "github.com/elastic/beats/v7/libbeat/reader" - "github.com/elastic/beats/v7/libbeat/reader/readfile" - "github.com/elastic/beats/v7/libbeat/reader/readfile/encoding" -) - -const ( - contentTypeJSON = "application/json" - contentTypeNDJSON = "application/x-ndjson" ) -type s3ObjectProcessorFactory struct { - log *logp.Logger - metrics *inputMetrics - s3 s3Getter - publisher beat.Client - fileSelectors []fileSelectorConfig +type commitWriteState struct { + time.Time } -func newS3ObjectProcessorFactory(log *logp.Logger, metrics *inputMetrics, s3 s3Getter, publisher beat.Client, sel []fileSelectorConfig) *s3ObjectProcessorFactory { - if metrics == nil { - metrics = newInputMetrics(monitoring.NewRegistry(), "") - } - if len(sel) == 0 { - sel = []fileSelectorConfig{ - {ReaderConfig: defaultConfig().ReaderConfig}, - } - } - return &s3ObjectProcessorFactory{ - log: log, - metrics: metrics, - s3: s3, - publisher: publisher, - fileSelectors: sel, - } +type s3ObjectInfo struct { + name string + key string + etag string + lastModified time.Time + listingID string } -func (f *s3ObjectProcessorFactory) findReaderConfig(key string) *readerConfig { - for _, sel := range f.fileSelectors { - if sel.Regex == nil || sel.Regex.MatchString(key) 
{ - return &sel.ReaderConfig - } - } - return nil +type s3ObjectPayload struct { + s3ObjectHandler s3ObjectHandler + s3ObjectInfo s3ObjectInfo + s3ObjectEvent s3EventV2 +} +type s3Poller struct { + numberOfWorkers int + bucket string + bucketPollInterval time.Duration + workerSem *sem + s3 s3API + log *logp.Logger + metrics *inputMetrics + s3ObjectHandler s3ObjectHandlerFactory + states *states + store *statestore.Store + workersListingMap *sync.Map + workersProcessingMap *sync.Map } -// Create returns a new s3ObjectProcessor. It returns nil when no file selectors -// match the S3 object key. -func (f *s3ObjectProcessorFactory) Create(ctx context.Context, log *logp.Logger, ack *eventACKTracker, obj s3EventV2) s3ObjectHandler { - log = log.With( - "s3_bucket", obj.S3.Bucket.Name, - "s3_object", obj.S3.Object.Key) - - readerConfig := f.findReaderConfig(obj.S3.Object.Key) - if readerConfig == nil { - log.Debug("Skipping S3 object processing. No file_selectors are a match.") - return nil +func newS3Poller(log *logp.Logger, + metrics *inputMetrics, + s3 s3API, + s3ObjectHandler s3ObjectHandlerFactory, + states *states, + store *statestore.Store, + bucket string, + numberOfWorkers int, + bucketPollInterval time.Duration) *s3Poller { + if metrics == nil { + metrics = newInputMetrics(monitoring.NewRegistry(), "") } - - return &s3ObjectProcessor{ - s3ObjectProcessorFactory: f, - log: log, - ctx: ctx, - acker: ack, - readerConfig: readerConfig, - s3Obj: obj, - s3ObjHash: s3ObjectHash(obj), + return &s3Poller{ + numberOfWorkers: numberOfWorkers, + bucket: bucket, + bucketPollInterval: bucketPollInterval, + workerSem: newSem(numberOfWorkers), + s3: s3, + log: log, + metrics: metrics, + s3ObjectHandler: s3ObjectHandler, + states: states, + store: store, + workersListingMap: new(sync.Map), + workersProcessingMap: new(sync.Map), } } -type s3ObjectProcessor struct { - *s3ObjectProcessorFactory +func (p *s3Poller) process(s3ObjectPayloadChan <-chan *s3ObjectPayload) error { + var errs []error - log *logp.Logger - ctx context.Context - acker *eventACKTracker // ACKer tied to the SQS message (multiple S3 readers share an ACKer when the S3 notification event contains more than one S3 object). - readerConfig *readerConfig // Config about how to process the object. - s3Obj s3EventV2 // S3 object information. - s3ObjHash string - - s3Metadata map[string]interface{} // S3 object metadata. -} +processingLoop: + for { + select { + case s3ObjectPayload := <-s3ObjectPayloadChan: + if s3ObjectPayload == nil { + break processingLoop + } -func (p *s3ObjectProcessor) ProcessS3Object() error { - if p == nil { - return nil - } + info := s3ObjectPayload.s3ObjectInfo + state := newState(info.name, info.key, info.etag, info.lastModified) + //check if another worker already is on it + dummyValue := struct{}{} + _, loaded := p.workersProcessingMap.LoadOrStore(state.Id, dummyValue) + if loaded { + // another worker is processing the state + continue + } - // Metrics and Logging - { - p.log.Debug("Begin S3 object processing.") - p.metrics.s3ObjectsRequestedTotal.Inc() - p.metrics.s3ObjectsInflight.Inc() - start := time.Now() - defer func() { - elapsed := time.Since(start) - p.metrics.s3ObjectsInflight.Dec() - p.metrics.s3ObjectProcessingTime.Update(elapsed.Nanoseconds()) - p.log.Debugw("End S3 object processing.", "elapsed_time_ns", elapsed) - }() - } + // Process S3 object (download, parse, create events). + err := s3ObjectPayload.s3ObjectHandler.ProcessS3Object() + // Wait for all events to be ACKed before proceeding. 
+ s3ObjectPayload.s3ObjectHandler.Wait() - // Request object (download). - contentType, meta, body, err := p.download() - if err != nil { - return errors.Wrap(err, "failed to get s3 object") - } - defer body.Close() - p.s3Metadata = meta + if err != nil { + event := s3ObjectPayload.s3ObjectEvent + errs = append(errs, errors.Wrapf(err, + "failed processing S3 event for object key %q in bucket %q", + event.S3.Object.Key, event.S3.Bucket.Name)) - reader, err := p.addGzipDecoderIfNeeded(newMonitoredReader(body, p.metrics.s3BytesProcessedTotal)) - if err != nil { - return errors.Wrap(err, "failed checking for gzip content") - } + // Manage locks for processing. + p.workersProcessingMap.Delete(state.Id) - // Overwrite with user configured Content-Type. - if p.readerConfig.ContentType != "" { - contentType = p.readerConfig.ContentType - } - - // Process object content stream. - switch { - case contentType == contentTypeJSON || contentType == contentTypeNDJSON: - err = p.readJSON(reader) - default: - err = p.readFile(reader) - } - if err != nil { - return err - } + continue - return nil -} + } -// download requests the S3 object from AWS and returns the object's -// Content-Type and reader to get the object's contents. The caller must -// close the returned reader. -func (p *s3ObjectProcessor) download() (contentType string, metadata map[string]interface{}, body io.ReadCloser, err error) { - resp, err := p.s3.GetObject(p.ctx, p.s3Obj.S3.Bucket.Name, p.s3Obj.S3.Object.Key) - if err != nil { - return "", nil, nil, err - } - meta := s3Metadata(resp, p.readerConfig.IncludeS3Metadata...) - return *resp.ContentType, meta, resp.Body, nil -} + // Manage locks for purging. + id := info.name + info.key + previousState := p.states.FindPreviousByID(id) + if !previousState.IsEmpty() { + state.MarkAsStored() + p.states.Update(state, info.listingID) + p.states.writeStates(p.store) + } -func (p *s3ObjectProcessor) addGzipDecoderIfNeeded(body io.Reader) (io.Reader, error) { - bufReader := bufio.NewReader(body) + if p.states.IsListingFullyStored(info.listingID) { + // locked on processing we unlock when all the object were ACKed + lock, _ := p.workersListingMap.Load(info.listingID) + lock.(*sync.Mutex).Unlock() + } - gzipped, err := isStreamGzipped(bufReader) - if err != nil { - return nil, err - } - if !gzipped { - return bufReader, nil + // Manage locks for processing. + p.workersProcessingMap.Delete(state.Id) + } } - return gzip.NewReader(bufReader) + return multierr.Combine(errs...) 
} -func (p *s3ObjectProcessor) readJSON(r io.Reader) error { - dec := json.NewDecoder(r) - dec.UseNumber() - - for dec.More() && p.ctx.Err() == nil { - offset := dec.InputOffset() - - var item json.RawMessage - if err := dec.Decode(&item); err != nil { - return fmt.Errorf("failed to decode json: %w", err) - } +func (p *s3Poller) getS3Objects(ctx context.Context, s3ObjectPayloadChan chan<- *s3ObjectPayload) { + defer close(s3ObjectPayloadChan) + bucketMetadata := strings.Split(p.bucket, ":") + bucketName := bucketMetadata[len(bucketMetadata)-1] - if p.readerConfig.ExpandEventListFromField != "" { - if err := p.splitEventList(p.readerConfig.ExpandEventListFromField, item, offset, p.s3ObjHash); err != nil { - return err - } + paginator := p.s3.ListObjectsPaginator(bucketName) + for paginator.Next(ctx) { + lock := new(sync.Mutex) + listingID, err := uuid.NewV4() + if err != nil { + p.log.Warnw("Error generating UUID for listing page.", "error", err) continue } + // lock for the listing page and state in workersListingMap + // this map is shared with the storedOp and will be unlocked there + lock.Lock() + p.workersListingMap.Store(listingID.String(), lock) + + totProcessableObjects := 0 + page := paginator.CurrentPage() + s3ObjectPayloadChanByPage := make(chan *s3ObjectPayload, len(page.Contents)) + for _, object := range page.Contents { + // Unescape s3 key name. For example, convert "%3D" back to "=". + filename, err := url.QueryUnescape(*object.Key) + if err != nil { + p.log.Errorw("Error when unescaping object key, skipping.", "error", err, "key", *object.Key) + continue + } - data, _ := item.MarshalJSON() - evt := createEvent(string(data), offset, p.s3Obj, p.s3ObjHash, p.s3Metadata) - p.publish(p.acker, &evt) - } - - return nil -} + state := newState(bucketName, filename, *object.ETag, *object.LastModified) + if p.states.MustSkip(state, p.store) { + p.log.Debugw("skipping state.", "state", state) + continue + } -func (p *s3ObjectProcessor) splitEventList(key string, raw json.RawMessage, offset int64, objHash string) error { - var jsonObject map[string]json.RawMessage - if err := json.Unmarshal(raw, &jsonObject); err != nil { - return err - } + p.states.Update(state, "") - raw, found := jsonObject[key] - if !found { - return fmt.Errorf("expand_event_list_from_field key <%v> is not in event", key) - } + event := s3EventV2{} + event.S3.Bucket.Name = bucketName + event.S3.Object.Key = filename - dec := json.NewDecoder(bytes.NewReader(raw)) - dec.UseNumber() + acker := newEventACKTracker(ctx) - tok, err := dec.Token() - if err != nil { - return err - } - delim, ok := tok.(json.Delim) - if !ok || delim != '[' { - return fmt.Errorf("expand_event_list_from_field <%v> is not an array", key) - } + s3Processor := p.s3ObjectHandler.Create(ctx, p.log, acker, event) + if s3Processor == nil { + continue + } - for dec.More() { - arrayOffset := dec.InputOffset() + totProcessableObjects++ - var item json.RawMessage - if err := dec.Decode(&item); err != nil { - return fmt.Errorf("failed to decode array item at offset %d: %w", offset+arrayOffset, err) + s3ObjectPayloadChanByPage <- &s3ObjectPayload{ + s3ObjectHandler: s3Processor, + s3ObjectInfo: s3ObjectInfo{ + name: bucketName, + key: filename, + etag: *object.ETag, + lastModified: *object.LastModified, + listingID: listingID.String(), + }, + s3ObjectEvent: event, + } } - data, _ := item.MarshalJSON() - evt := createEvent(string(data), offset+arrayOffset, p.s3Obj, objHash, p.s3Metadata) - p.publish(p.acker, &evt) - } - - return nil -} + listingInfo := 
listingInfo{totObjects: totProcessableObjects} + p.states.AddListing(listingID.String(), listingInfo) + if totProcessableObjects == 0 { + // nothing to be ACKed, unlock here + lock.Unlock() + } -func (p *s3ObjectProcessor) readFile(r io.Reader) error { - encodingFactory, ok := encoding.FindEncoding(p.readerConfig.Encoding) - if !ok || encodingFactory == nil { - return fmt.Errorf("failed to find '%v' encoding", p.readerConfig.Encoding) + close(s3ObjectPayloadChanByPage) + for s3ObjectPayload := range s3ObjectPayloadChanByPage { + s3ObjectPayloadChan <- s3ObjectPayload + } } - enc, err := encodingFactory(r) - if err != nil { - return fmt.Errorf("failed to initialize encoding: %w", err) + if err := paginator.Err(); err != nil { + p.log.Warnw("Error when paginating listing.", "error", err) } - var reader reader.Reader - reader, err = readfile.NewEncodeReader(ioutil.NopCloser(r), readfile.Config{ - Codec: enc, - BufferSize: int(p.readerConfig.BufferSize), - Terminator: p.readerConfig.LineTerminator, - MaxBytes: int(p.readerConfig.MaxBytes) * 4, - }) - if err != nil { - return fmt.Errorf("failed to create encode reader: %w", err) - } + return +} - reader = readfile.NewStripNewline(reader, p.readerConfig.LineTerminator) - reader = p.readerConfig.Parsers.Create(reader) - reader = readfile.NewLimitReader(reader, int(p.readerConfig.MaxBytes)) +func (p *s3Poller) purge() { + for _, listingID := range p.states.GetListingIDs() { + // we lock here in order to process the purge only after + // full listing page is ACKed by all the workers + lock, _ := p.workersListingMap.Load(listingID) + lock.(*sync.Mutex).Lock() - var offset int64 - for { - message, err := reader.Next() - if err == io.EOF { - // No more lines - break - } - if err != nil { - return fmt.Errorf("error reading message: %w", err) - } + keys := map[string]struct{}{} + latestStoredTimeByBucket := make(map[string]time.Time, 0) - event := createEvent(string(message.Content), offset, p.s3Obj, p.s3ObjHash, p.s3Metadata) - event.Fields.DeepUpdate(message.Fields) - offset += int64(message.Bytes) - p.publish(p.acker, &event) - } + for _, state := range p.states.GetStatesByListingID(listingID) { + // it is not stored, keep + if !state.Stored { + continue + } - return nil -} + var latestStoredTime time.Time + keys[state.Id] = struct{}{} + latestStoredTime, ok := latestStoredTimeByBucket[state.Bucket] + if !ok { + var commitWriteState commitWriteState + err := p.store.Get(awsS3WriteCommitPrefix+state.Bucket, &commitWriteState) + if err == nil { + // we have no entry in the map and we have no entry in the store + // set zero time + latestStoredTime = time.Time{} + } -func (p *s3ObjectProcessor) publish(ack *eventACKTracker, event *beat.Event) { - ack.Add(1) - event.Private = ack - p.metrics.s3EventsCreatedTotal.Inc() - p.publisher.Publish(*event) -} + latestStoredTime = commitWriteState.Time -func createEvent(message string, offset int64, obj s3EventV2, objectHash string, meta map[string]interface{}) beat.Event { - event := beat.Event{ - Timestamp: time.Now().UTC(), - Fields: common.MapStr{ - "message": message, - "log": common.MapStr{ - "offset": offset, - "file": common.MapStr{ - "path": constructObjectURL(obj), - }, - }, - "aws": common.MapStr{ - "s3": common.MapStr{ - "bucket": common.MapStr{ - "name": obj.S3.Bucket.Name, - "arn": obj.S3.Bucket.ARN}, - "object": common.MapStr{ - "key": obj.S3.Object.Key, - }, - }, - }, - "cloud": common.MapStr{ - "provider": "aws", - "region": obj.AWSRegion, - }, - }, - } - event.SetID(objectID(objectHash, offset)) 
+ } - if len(meta) > 0 { - event.Fields.Put("aws.s3.metadata", meta) - } + if state.LastModified.After(latestStoredTime) { + latestStoredTimeByBucket[state.Bucket] = state.LastModified + } - return event -} + } -func objectID(objectHash string, offset int64) string { - return fmt.Sprintf("%s-%012d", objectHash, offset) -} + for key := range keys { + p.states.Delete(key) + } -func constructObjectURL(obj s3EventV2) string { - return "https://" + obj.S3.Bucket.Name + ".s3." + obj.AWSRegion + ".amazonaws.com/" + obj.S3.Object.Key -} + p.states.writeStates(p.store) -// s3ObjectHash returns a short sha256 hash of the bucket arn + object key name. -func s3ObjectHash(obj s3EventV2) string { - h := sha256.New() - h.Write([]byte(obj.S3.Bucket.ARN)) - h.Write([]byte(obj.S3.Object.Key)) - prefix := hex.EncodeToString(h.Sum(nil)) - return prefix[:10] -} + for bucket, latestStoredTime := range latestStoredTimeByBucket { + if err := p.store.Set(awsS3WriteCommitPrefix+bucket, commitWriteState{latestStoredTime}); err != nil { + p.log.Errorw("Failed to write commit time to the registry", "error", err) + } + } -// isStreamGzipped determines whether the given stream of bytes (encapsulated in a buffered reader) -// represents gzipped content or not. A buffered reader is used so the function can peek into the byte -// stream without consuming it. This makes it convenient for code executed after this function call -// to consume the stream if it wants. -func isStreamGzipped(r *bufio.Reader) (bool, error) { - // Why 512? See https://godoc.org/net/http#DetectContentType - buf, err := r.Peek(512) - if err != nil && err != io.EOF { - return false, err + // workersListingMap map is shared with the storedop + // purge is done, we can unlock and clean + lock.(*sync.Mutex).Unlock() + p.workersListingMap.Delete(listingID) + p.states.DeleteListing(listingID) } - switch http.DetectContentType(buf) { - case "application/x-gzip", "application/zip": - return true, nil - default: - return false, nil - } + return } -// s3Metadata returns a map containing the selected S3 object metadata keys. -func s3Metadata(resp *s3.GetObjectResponse, keys ...string) common.MapStr { - if len(keys) == 0 { - return nil - } +func (p *s3Poller) Poll(ctx context.Context) error { + // This loop tries to keep the workers busy as much as possible while + // honoring the number in config opposed to a simpler loop that does one + // listing, sequentially processes every object and then does another listing + workerWg := new(sync.WaitGroup) + for ctx.Err() == nil { - // When you upload objects using the REST API, the optional user-defined - // metadata names must begin with "x-amz-meta-" to distinguish them from - // other HTTP headers. - const userMetaPrefix = "x-amz-meta-" - - allMeta := map[string]interface{}{} - - // Get headers using AWS SDK struct tags. - fields := reflect.TypeOf(resp.GetObjectOutput).Elem() - values := reflect.ValueOf(resp.GetObjectOutput).Elem() - for i := 0; i < fields.NumField(); i++ { - f := fields.Field(i) - - if loc, _ := f.Tag.Lookup("location"); loc != "header" { - continue + // Determine how many S3 workers are available. 
+ workers, err := p.workerSem.AcquireContext(p.numberOfWorkers, ctx) + if err != nil { + break } - name, found := f.Tag.Lookup("locationName") - if !found { - continue - } - name = strings.ToLower(name) + s3ObjectPayloadChan := make(chan *s3ObjectPayload) - if name == userMetaPrefix { - continue - } + workerWg.Add(1) + go func() { + defer workerWg.Done() + p.getS3Objects(ctx, s3ObjectPayloadChan) + }() - v := values.Field(i) - switch v.Kind() { - case reflect.Ptr: - if v.IsNil() { - continue - } - v = v.Elem() - default: - if v.IsZero() { - continue - } + workerWg.Add(workers) + for i := 0; i < workers; i++ { + go func() { + defer func() { + workerWg.Done() + p.workerSem.Release(1) + }() + if err := p.process(s3ObjectPayloadChan); err != nil { + p.log.Warnw("Failed processing S3 listing.", "error", err) + } + }() } - allMeta[name] = v.Interface() - } + p.purge() - // Add in the user defined headers. - for k, v := range resp.Metadata { - k = strings.ToLower(k) - allMeta[userMetaPrefix+k] = v + <-time.After(p.bucketPollInterval) } - // Select the matching headers from the config. - metadata := common.MapStr{} - for _, key := range keys { - key = strings.ToLower(key) + // Wait for all workers to finish. + workerWg.Wait() - v, found := allMeta[key] - if !found { - continue - } - - metadata[key] = v + if errors.Is(ctx.Err(), context.Canceled) { + // A canceled context is a normal shutdown. + return nil } - - return metadata + return ctx.Err() } diff --git a/x-pack/filebeat/input/awss3/s3_objects.go b/x-pack/filebeat/input/awss3/s3_objects.go new file mode 100644 index 000000000000..4841bebdaad9 --- /dev/null +++ b/x-pack/filebeat/input/awss3/s3_objects.go @@ -0,0 +1,453 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. 
+ +package awss3 + +import ( + "bufio" + "bytes" + "compress/gzip" + "context" + "crypto/sha256" + "encoding/hex" + "encoding/json" + "fmt" + "io" + "io/ioutil" + "net/http" + "reflect" + "strings" + "time" + + "github.com/aws/aws-sdk-go-v2/service/s3" + "github.com/pkg/errors" + + "github.com/elastic/beats/v7/libbeat/beat" + "github.com/elastic/beats/v7/libbeat/common" + "github.com/elastic/beats/v7/libbeat/logp" + "github.com/elastic/beats/v7/libbeat/monitoring" + "github.com/elastic/beats/v7/libbeat/reader" + "github.com/elastic/beats/v7/libbeat/reader/readfile" + "github.com/elastic/beats/v7/libbeat/reader/readfile/encoding" +) + +const ( + contentTypeJSON = "application/json" + contentTypeNDJSON = "application/x-ndjson" +) + +type s3ObjectProcessorFactory struct { + log *logp.Logger + metrics *inputMetrics + s3 s3Getter + publisher beat.Client + fileSelectors []fileSelectorConfig +} + +func newS3ObjectProcessorFactory(log *logp.Logger, metrics *inputMetrics, s3 s3Getter, publisher beat.Client, sel []fileSelectorConfig) *s3ObjectProcessorFactory { + if metrics == nil { + metrics = newInputMetrics(monitoring.NewRegistry(), "") + } + if len(sel) == 0 { + sel = []fileSelectorConfig{ + {ReaderConfig: defaultConfig().ReaderConfig}, + } + } + return &s3ObjectProcessorFactory{ + log: log, + metrics: metrics, + s3: s3, + publisher: publisher, + fileSelectors: sel, + } +} + +func (f *s3ObjectProcessorFactory) findReaderConfig(key string) *readerConfig { + for _, sel := range f.fileSelectors { + if sel.Regex == nil || sel.Regex.MatchString(key) { + return &sel.ReaderConfig + } + } + return nil +} + +// Create returns a new s3ObjectProcessor. It returns nil when no file selectors +// match the S3 object key. +func (f *s3ObjectProcessorFactory) Create(ctx context.Context, log *logp.Logger, ack *eventACKTracker, obj s3EventV2) s3ObjectHandler { + log = log.With( + "s3_bucket", obj.S3.Bucket.Name, + "s3_object", obj.S3.Object.Key) + + readerConfig := f.findReaderConfig(obj.S3.Object.Key) + if readerConfig == nil { + log.Debug("Skipping S3 object processing. No file_selectors are a match.") + return nil + } + + return &s3ObjectProcessor{ + s3ObjectProcessorFactory: f, + log: log, + ctx: ctx, + acker: ack, + readerConfig: readerConfig, + s3Obj: obj, + s3ObjHash: s3ObjectHash(obj), + } +} + +type s3ObjectProcessor struct { + *s3ObjectProcessorFactory + + log *logp.Logger + ctx context.Context + acker *eventACKTracker // ACKer tied to the SQS message (multiple S3 readers share an ACKer when the S3 notification event contains more than one S3 object). + readerConfig *readerConfig // Config about how to process the object. + s3Obj s3EventV2 // S3 object information. + s3ObjHash string + + s3Metadata map[string]interface{} // S3 object metadata. +} + +func (p *s3ObjectProcessor) Wait() { + p.acker.Wait() +} + +func (p *s3ObjectProcessor) ProcessS3Object() error { + if p == nil { + return nil + } + + // Metrics and Logging + { + p.log.Debug("Begin S3 object processing.") + p.metrics.s3ObjectsRequestedTotal.Inc() + p.metrics.s3ObjectsInflight.Inc() + start := time.Now() + defer func() { + elapsed := time.Since(start) + p.metrics.s3ObjectsInflight.Dec() + p.metrics.s3ObjectProcessingTime.Update(elapsed.Nanoseconds()) + p.log.Debugw("End S3 object processing.", "elapsed_time_ns", elapsed) + }() + } + + // Request object (download). 
+ contentType, meta, body, err := p.download() + if err != nil { + return errors.Wrap(err, "failed to get s3 object") + } + defer body.Close() + p.s3Metadata = meta + + reader, err := p.addGzipDecoderIfNeeded(newMonitoredReader(body, p.metrics.s3BytesProcessedTotal)) + if err != nil { + return errors.Wrap(err, "failed checking for gzip content") + } + + // Overwrite with user configured Content-Type. + if p.readerConfig.ContentType != "" { + contentType = p.readerConfig.ContentType + } + + // Process object content stream. + switch { + case contentType == contentTypeJSON || contentType == contentTypeNDJSON: + err = p.readJSON(reader) + default: + err = p.readFile(reader) + } + if err != nil { + return err + } + + return nil +} + +// download requests the S3 object from AWS and returns the object's +// Content-Type and reader to get the object's contents. The caller must +// close the returned reader. +func (p *s3ObjectProcessor) download() (contentType string, metadata map[string]interface{}, body io.ReadCloser, err error) { + resp, err := p.s3.GetObject(p.ctx, p.s3Obj.S3.Bucket.Name, p.s3Obj.S3.Object.Key) + if err != nil { + return "", nil, nil, err + } + meta := s3Metadata(resp, p.readerConfig.IncludeS3Metadata...) + return *resp.ContentType, meta, resp.Body, nil +} + +func (p *s3ObjectProcessor) addGzipDecoderIfNeeded(body io.Reader) (io.Reader, error) { + bufReader := bufio.NewReader(body) + + gzipped, err := isStreamGzipped(bufReader) + if err != nil { + return nil, err + } + if !gzipped { + return bufReader, nil + } + + return gzip.NewReader(bufReader) +} + +func (p *s3ObjectProcessor) readJSON(r io.Reader) error { + dec := json.NewDecoder(r) + dec.UseNumber() + + for dec.More() && p.ctx.Err() == nil { + offset := dec.InputOffset() + + var item json.RawMessage + if err := dec.Decode(&item); err != nil { + return fmt.Errorf("failed to decode json: %w", err) + } + + if p.readerConfig.ExpandEventListFromField != "" { + if err := p.splitEventList(p.readerConfig.ExpandEventListFromField, item, offset, p.s3ObjHash); err != nil { + return err + } + continue + } + + data, _ := item.MarshalJSON() + evt := createEvent(string(data), offset, p.s3Obj, p.s3ObjHash, p.s3Metadata) + p.publish(p.acker, &evt) + } + + return nil +} + +func (p *s3ObjectProcessor) splitEventList(key string, raw json.RawMessage, offset int64, objHash string) error { + var jsonObject map[string]json.RawMessage + if err := json.Unmarshal(raw, &jsonObject); err != nil { + return err + } + + raw, found := jsonObject[key] + if !found { + return fmt.Errorf("expand_event_list_from_field key <%v> is not in event", key) + } + + dec := json.NewDecoder(bytes.NewReader(raw)) + dec.UseNumber() + + tok, err := dec.Token() + if err != nil { + return err + } + delim, ok := tok.(json.Delim) + if !ok || delim != '[' { + return fmt.Errorf("expand_event_list_from_field <%v> is not an array", key) + } + + for dec.More() { + arrayOffset := dec.InputOffset() + + var item json.RawMessage + if err := dec.Decode(&item); err != nil { + return fmt.Errorf("failed to decode array item at offset %d: %w", offset+arrayOffset, err) + } + + data, _ := item.MarshalJSON() + evt := createEvent(string(data), offset+arrayOffset, p.s3Obj, objHash, p.s3Metadata) + p.publish(p.acker, &evt) + } + + return nil +} + +func (p *s3ObjectProcessor) readFile(r io.Reader) error { + encodingFactory, ok := encoding.FindEncoding(p.readerConfig.Encoding) + if !ok || encodingFactory == nil { + return fmt.Errorf("failed to find '%v' encoding", p.readerConfig.Encoding) + } + + 
enc, err := encodingFactory(r) + if err != nil { + return fmt.Errorf("failed to initialize encoding: %w", err) + } + + var reader reader.Reader + reader, err = readfile.NewEncodeReader(ioutil.NopCloser(r), readfile.Config{ + Codec: enc, + BufferSize: int(p.readerConfig.BufferSize), + Terminator: p.readerConfig.LineTerminator, + MaxBytes: int(p.readerConfig.MaxBytes) * 4, + }) + if err != nil { + return fmt.Errorf("failed to create encode reader: %w", err) + } + + reader = readfile.NewStripNewline(reader, p.readerConfig.LineTerminator) + reader = p.readerConfig.Parsers.Create(reader) + reader = readfile.NewLimitReader(reader, int(p.readerConfig.MaxBytes)) + + var offset int64 + for { + message, err := reader.Next() + if err == io.EOF { + // No more lines + break + } + if err != nil { + return fmt.Errorf("error reading message: %w", err) + } + + event := createEvent(string(message.Content), offset, p.s3Obj, p.s3ObjHash, p.s3Metadata) + event.Fields.DeepUpdate(message.Fields) + offset += int64(message.Bytes) + p.publish(p.acker, &event) + } + + return nil +} + +func (p *s3ObjectProcessor) publish(ack *eventACKTracker, event *beat.Event) { + ack.Add(1) + event.Private = ack + p.metrics.s3EventsCreatedTotal.Inc() + p.publisher.Publish(*event) +} + +func createEvent(message string, offset int64, obj s3EventV2, objectHash string, meta map[string]interface{}) beat.Event { + event := beat.Event{ + Timestamp: time.Now().UTC(), + Fields: common.MapStr{ + "message": message, + "log": common.MapStr{ + "offset": offset, + "file": common.MapStr{ + "path": constructObjectURL(obj), + }, + }, + "aws": common.MapStr{ + "s3": common.MapStr{ + "bucket": common.MapStr{ + "name": obj.S3.Bucket.Name, + "arn": obj.S3.Bucket.ARN}, + "object": common.MapStr{ + "key": obj.S3.Object.Key, + }, + }, + }, + "cloud": common.MapStr{ + "provider": "aws", + "region": obj.AWSRegion, + }, + }, + } + event.SetID(objectID(objectHash, offset)) + + if len(meta) > 0 { + event.Fields.Put("aws.s3.metadata", meta) + } + + return event +} + +func objectID(objectHash string, offset int64) string { + return fmt.Sprintf("%s-%012d", objectHash, offset) +} + +func constructObjectURL(obj s3EventV2) string { + return "https://" + obj.S3.Bucket.Name + ".s3." + obj.AWSRegion + ".amazonaws.com/" + obj.S3.Object.Key +} + +// s3ObjectHash returns a short sha256 hash of the bucket arn + object key name. +func s3ObjectHash(obj s3EventV2) string { + h := sha256.New() + h.Write([]byte(obj.S3.Bucket.ARN)) + h.Write([]byte(obj.S3.Object.Key)) + prefix := hex.EncodeToString(h.Sum(nil)) + return prefix[:10] +} + +// isStreamGzipped determines whether the given stream of bytes (encapsulated in a buffered reader) +// represents gzipped content or not. A buffered reader is used so the function can peek into the byte +// stream without consuming it. This makes it convenient for code executed after this function call +// to consume the stream if it wants. +func isStreamGzipped(r *bufio.Reader) (bool, error) { + // Why 512? See https://godoc.org/net/http#DetectContentType + buf, err := r.Peek(512) + if err != nil && err != io.EOF { + return false, err + } + + switch http.DetectContentType(buf) { + case "application/x-gzip", "application/zip": + return true, nil + default: + return false, nil + } +} + +// s3Metadata returns a map containing the selected S3 object metadata keys. 
+func s3Metadata(resp *s3.GetObjectResponse, keys ...string) common.MapStr { + if len(keys) == 0 { + return nil + } + + // When you upload objects using the REST API, the optional user-defined + // metadata names must begin with "x-amz-meta-" to distinguish them from + // other HTTP headers. + const userMetaPrefix = "x-amz-meta-" + + allMeta := map[string]interface{}{} + + // Get headers using AWS SDK struct tags. + fields := reflect.TypeOf(resp.GetObjectOutput).Elem() + values := reflect.ValueOf(resp.GetObjectOutput).Elem() + for i := 0; i < fields.NumField(); i++ { + f := fields.Field(i) + + if loc, _ := f.Tag.Lookup("location"); loc != "header" { + continue + } + + name, found := f.Tag.Lookup("locationName") + if !found { + continue + } + name = strings.ToLower(name) + + if name == userMetaPrefix { + continue + } + + v := values.Field(i) + switch v.Kind() { + case reflect.Ptr: + if v.IsNil() { + continue + } + v = v.Elem() + default: + if v.IsZero() { + continue + } + } + + allMeta[name] = v.Interface() + } + + // Add in the user defined headers. + for k, v := range resp.Metadata { + k = strings.ToLower(k) + allMeta[userMetaPrefix+k] = v + } + + // Select the matching headers from the config. + metadata := common.MapStr{} + for _, key := range keys { + key = strings.ToLower(key) + + v, found := allMeta[key] + if !found { + continue + } + + metadata[key] = v + } + + return metadata +} diff --git a/x-pack/filebeat/input/awss3/s3_objects_test.go b/x-pack/filebeat/input/awss3/s3_objects_test.go new file mode 100644 index 000000000000..6cf1ea1fa5a8 --- /dev/null +++ b/x-pack/filebeat/input/awss3/s3_objects_test.go @@ -0,0 +1,261 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. 
+ +package awss3 + +import ( + "bytes" + "context" + "errors" + "io/ioutil" + "path/filepath" + "strings" + "testing" + + awssdk "github.com/aws/aws-sdk-go-v2/aws" + "github.com/aws/aws-sdk-go-v2/service/s3" + "github.com/golang/mock/gomock" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/elastic/beats/v7/libbeat/beat" + "github.com/elastic/beats/v7/libbeat/common" + "github.com/elastic/beats/v7/libbeat/logp" +) + +func newS3Object(t testing.TB, filename, contentType string) (s3EventV2, *s3.GetObjectResponse) { + data, err := ioutil.ReadFile(filename) + if err != nil { + t.Fatal(err) + } + + return newS3Event(filename), newS3GetObjectResponse(filename, data, contentType) +} + +func newS3GetObjectResponse(filename string, data []byte, contentType string) *s3.GetObjectResponse { + r := bytes.NewReader(data) + contentLen := int64(r.Len()) + resp := &s3.GetObjectResponse{ + GetObjectOutput: &s3.GetObjectOutput{ + Body: ioutil.NopCloser(r), + ContentLength: &contentLen, + ContentType: &contentType, + }, + } + switch strings.ToLower(filepath.Ext(filename)) { + case ".gz": + gzipEncoding := "gzip" + resp.ContentEncoding = &gzipEncoding + } + return resp +} + +func TestS3ObjectProcessor(t *testing.T) { + logp.TestingSetup() + + t.Run("download text/plain file", func(t *testing.T) { + testProcessS3Object(t, "testdata/log.txt", "text/plain", 2) + }) + + t.Run("multiline content", func(t *testing.T) { + sel := fileSelectorConfig{ReaderConfig: readerConfig{}} + sel.ReaderConfig.InitDefaults() + + // Unfortunately the config structs for the parser package are not + // exported to use config parsing. + cfg := common.MustNewConfigFrom(map[string]interface{}{ + "parsers": []map[string]interface{}{ + { + "multiline": map[string]interface{}{ + "pattern": "^ len(s3Objects) { + endIdx = len(s3Objects) + } + return &s3.ListObjectsOutput{ + Contents: s3Objects[startIdx:endIdx], + } + }) + mockS3Pager.EXPECT().Err().Return(nil) + + return mockS3Pager +} diff --git a/x-pack/filebeat/input/awss3/s3_test.go b/x-pack/filebeat/input/awss3/s3_test.go index 6cf1ea1fa5a8..a98e51ccaef8 100644 --- a/x-pack/filebeat/input/awss3/s3_test.go +++ b/x-pack/filebeat/input/awss3/s3_test.go @@ -5,257 +5,266 @@ package awss3 import ( - "bytes" "context" - "errors" - "io/ioutil" - "path/filepath" - "strings" "testing" + "time" + + "github.com/elastic/beats/v7/libbeat/statestore" + "github.com/elastic/beats/v7/libbeat/statestore/storetest" + + "github.com/aws/aws-sdk-go-v2/aws" - awssdk "github.com/aws/aws-sdk-go-v2/aws" "github.com/aws/aws-sdk-go-v2/service/s3" + "github.com/golang/mock/gomock" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - "github.com/elastic/beats/v7/libbeat/beat" - "github.com/elastic/beats/v7/libbeat/common" "github.com/elastic/beats/v7/libbeat/logp" ) -func newS3Object(t testing.TB, filename, contentType string) (s3EventV2, *s3.GetObjectResponse) { - data, err := ioutil.ReadFile(filename) - if err != nil { - t.Fatal(err) - } - - return newS3Event(filename), newS3GetObjectResponse(filename, data, contentType) -} - -func newS3GetObjectResponse(filename string, data []byte, contentType string) *s3.GetObjectResponse { - r := bytes.NewReader(data) - contentLen := int64(r.Len()) - resp := &s3.GetObjectResponse{ - GetObjectOutput: &s3.GetObjectOutput{ - Body: ioutil.NopCloser(r), - ContentLength: &contentLen, - ContentType: &contentType, - }, - } - switch strings.ToLower(filepath.Ext(filename)) { - case ".gz": - gzipEncoding := "gzip" - 
resp.ContentEncoding = &gzipEncoding - } - return resp -} - -func TestS3ObjectProcessor(t *testing.T) { +func TestS3Poller(t *testing.T) { logp.TestingSetup() + const bucket = "bucket" + const numberOfWorkers = 5 + const pollInterval = 10 * time.Second + const testTimeout = 5 * time.Millisecond + + t.Run("Poll success", func(t *testing.T) { + storeReg := statestore.NewRegistry(storetest.NewMemoryStoreBackend()) + store, err := storeReg.Get("test") + if err != nil { + t.Fatalf("Failed to access store: %v", err) + } - t.Run("download text/plain file", func(t *testing.T) { - testProcessS3Object(t, "testdata/log.txt", "text/plain", 2) - }) - - t.Run("multiline content", func(t *testing.T) { - sel := fileSelectorConfig{ReaderConfig: readerConfig{}} - sel.ReaderConfig.InitDefaults() - - // Unfortunately the config structs for the parser package are not - // exported to use config parsing. - cfg := common.MustNewConfigFrom(map[string]interface{}{ - "parsers": []map[string]interface{}{ - { - "multiline": map[string]interface{}{ - "pattern": "^ len(s3Objects) { - endIdx = len(s3Objects) - } - return &s3.ListObjectsOutput{ - Contents: s3Objects[startIdx:endIdx], - } + s3ObjProc := newS3ObjectProcessorFactory(logp.NewLogger(inputName), nil, mockAPI, mockPublisher, nil) + receiver := newS3Poller(logp.NewLogger(inputName), nil, mockAPI, s3ObjProc, newStates(), store, bucket, numberOfWorkers, pollInterval) + require.Error(t, context.DeadlineExceeded, receiver.Poll(ctx)) + assert.Equal(t, numberOfWorkers, receiver.workerSem.available) }) - mockS3Pager.EXPECT().Err().Return(nil) - - return mockS3Pager } diff --git a/x-pack/filebeat/input/awss3/state.go b/x-pack/filebeat/input/awss3/state.go index aefd59414b1f..0ace11ea6923 100644 --- a/x-pack/filebeat/input/awss3/state.go +++ b/x-pack/filebeat/input/awss3/state.go @@ -9,14 +9,16 @@ import ( "time" ) -// state is used to communicate the reading state of a file +// state is used to communicate the publishing state of a s3 object type state struct { Id string `json:"id" struct:"id"` Bucket string `json:"bucket" struct:"bucket"` Key string `json:"key" struct:"key"` Etag string `json:"etag" struct:"etag"` LastModified time.Time `json:"last_modified" struct:"last_modifed"` - Stored bool `json:"stored" struct:"stored"` + + // A state has Stored = true when all events are ACKed. + Stored bool `json:"stored" struct:"stored"` } // newState creates a new s3 object state @@ -39,7 +41,7 @@ func (s *state) MarkAsStored() { s.Stored = true } -// IsEqual checks if the two states point to the same file. +// IsEqual checks if the two states point to the same s3 object. func (s *state) IsEqual(c *state) bool { return s.Bucket == c.Bucket && s.Key == c.Key && s.Etag == c.Etag && s.LastModified.Equal(c.LastModified) } diff --git a/x-pack/filebeat/input/awss3/states.go b/x-pack/filebeat/input/awss3/states.go index de7abacf996b..a13230f9e25c 100644 --- a/x-pack/filebeat/input/awss3/states.go +++ b/x-pack/filebeat/input/awss3/states.go @@ -18,17 +18,6 @@ const ( awsS3WriteCommitPrefix = "filebeat::aws-s3::writeCommit::" ) -// storedOp keeps track of pending updates that are not written to the persistent store yet. -// Update operations are ordered. The input manager guarantees that only one -// input can create update operation for a source, such that new input -// instances can add update operations to be executed after already pending -// update operations from older inputs instances that have been shutdown. 
-type storedOp struct { - states *states - store *statestore.Store - lockMap *sync.Map -} - type listingInfo struct { totObjects int storedObjects int @@ -61,6 +50,36 @@ func newStates() *states { } } +func (s *states) MustSkip(state state, store *statestore.Store) bool { + if !s.IsNew(state) { + // here we should purge from the store + s.Delete(state.Id) + s.writeStates(store) + + return true + } + + previousState := s.FindPrevious(state) + + // status is forget. if there is no previous state and + // the state.LastModified is before the last cleanStore + // write commit we can remove + var commitWriteState commitWriteState + err := store.Get(awsS3WriteCommitPrefix+state.Bucket, &commitWriteState) + if err == nil && previousState.IsEmpty() && + (state.LastModified.Before(commitWriteState.Time) || state.LastModified.Equal(commitWriteState.Time)) { + return true + } + + // we have no previous state or the previous state + // is not stored: refresh the state + if previousState.IsEmpty() || !previousState.Stored { + s.Update(state, "") + } + + return false +} + func (s *states) Delete(id string) { s.Lock() defer s.Unlock() @@ -115,7 +134,7 @@ func (s *states) Update(newState state, listingID string) { // No existing state found, add new one s.idx[id] = len(s.states) s.states = append(s.states, newState) - logp.Debug("input", "New state added for %s", newState.Key) + logp.Debug("input", "New state added for %s", newState.Id) } if listingID == "" || !newState.Stored { diff --git a/x-pack/filebeat/input/default-inputs/inputs.go b/x-pack/filebeat/input/default-inputs/inputs.go index a3381cb42d0b..7fc3737e37da 100644 --- a/x-pack/filebeat/input/default-inputs/inputs.go +++ b/x-pack/filebeat/input/default-inputs/inputs.go @@ -30,6 +30,6 @@ func xpackInputs(info beat.Info, log *logp.Logger, store beater.StateStore) []v2 http_endpoint.Plugin(), httpjson.Plugin(log, store), o365audit.Plugin(log, store), - awss3.Plugin(), + awss3.Plugin(store), } } From 4ed3e5ab96aecb076b2cf6785cee4786dcca21ed Mon Sep 17 00:00:00 2001 From: Andrea Spacca Date: Thu, 12 Aug 2021 18:59:23 +0200 Subject: [PATCH 08/20] make update --- x-pack/filebeat/input/awss3/mock_interfaces_test.go | 7 ++++++- x-pack/filebeat/input/awss3/mock_publisher_test.go | 7 ++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/x-pack/filebeat/input/awss3/mock_interfaces_test.go b/x-pack/filebeat/input/awss3/mock_interfaces_test.go index d05381786aab..1929fa7c9ec3 100644 --- a/x-pack/filebeat/input/awss3/mock_interfaces_test.go +++ b/x-pack/filebeat/input/awss3/mock_interfaces_test.go @@ -1,3 +1,7 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. + // Code generated by MockGen. DO NOT EDIT. // Source: interfaces.go @@ -11,8 +15,9 @@ import ( s3 "github.com/aws/aws-sdk-go-v2/service/s3" sqs "github.com/aws/aws-sdk-go-v2/service/sqs" - logp "github.com/elastic/beats/v7/libbeat/logp" gomock "github.com/golang/mock/gomock" + + logp "github.com/elastic/beats/v7/libbeat/logp" ) // MockSQSAPI is a mock of sqsAPI interface. diff --git a/x-pack/filebeat/input/awss3/mock_publisher_test.go b/x-pack/filebeat/input/awss3/mock_publisher_test.go index 40c46062a38d..7fa935496aad 100644 --- a/x-pack/filebeat/input/awss3/mock_publisher_test.go +++ b/x-pack/filebeat/input/awss3/mock_publisher_test.go @@ -1,3 +1,7 @@ +// Copyright Elasticsearch B.V. 
and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. + // Code generated by MockGen. DO NOT EDIT. // Source: github.com/elastic/beats/v7/libbeat/beat (interfaces: Client) @@ -7,8 +11,9 @@ package awss3 import ( reflect "reflect" - beat "github.com/elastic/beats/v7/libbeat/beat" gomock "github.com/golang/mock/gomock" + + beat "github.com/elastic/beats/v7/libbeat/beat" ) // MockBeatClient is a mock of Client interface. From a1f9e53dd9f727aeb26ede3fb4979a3af6b43d7c Mon Sep 17 00:00:00 2001 From: Andrea Spacca Date: Fri, 13 Aug 2021 10:53:25 +0200 Subject: [PATCH 09/20] benchmark, bugfix and metrics --- libbeat/publisher/testing/testing.go | 16 +- x-pack/filebeat/input/awss3/acker.go | 4 +- x-pack/filebeat/input/awss3/acker_test.go | 4 +- .../input/awss3/input_benchmark_test.go | 210 ++++++++++++++++-- x-pack/filebeat/input/awss3/metrics.go | 6 + x-pack/filebeat/input/awss3/s3.go | 88 ++++---- x-pack/filebeat/input/awss3/s3_objects.go | 2 +- x-pack/filebeat/input/awss3/states.go | 28 +-- 8 files changed, 283 insertions(+), 75 deletions(-) diff --git a/libbeat/publisher/testing/testing.go b/libbeat/publisher/testing/testing.go index 5b5e592d69ef..0c64e4601d5e 100644 --- a/libbeat/publisher/testing/testing.go +++ b/libbeat/publisher/testing/testing.go @@ -28,8 +28,9 @@ type TestPublisher struct { // given channel only. type ChanClient struct { - done chan struct{} - Channel chan beat.Event + done chan struct{} + Channel chan beat.Event + publishCallback func(event beat.Event) } func PublisherWithClient(client beat.Client) beat.Pipeline { @@ -44,6 +45,13 @@ func (pub *TestPublisher) ConnectWith(_ beat.ClientConfig) (beat.Client, error) return pub.client, nil } +func NewChanClientWithCallback(bufSize int, callback func(event beat.Event)) *ChanClient { + chanClient := NewChanClientWith(make(chan beat.Event, bufSize)) + chanClient.publishCallback = callback + + return chanClient +} + func NewChanClient(bufSize int) *ChanClient { return NewChanClientWith(make(chan beat.Event, bufSize)) } @@ -70,6 +78,10 @@ func (c *ChanClient) Publish(event beat.Event) { select { case <-c.done: case c.Channel <- event: + if c.publishCallback != nil { + c.publishCallback(event) + <-c.Channel + } } } diff --git a/x-pack/filebeat/input/awss3/acker.go b/x-pack/filebeat/input/awss3/acker.go index c9ab9a755049..2c4cc39ee682 100644 --- a/x-pack/filebeat/input/awss3/acker.go +++ b/x-pack/filebeat/input/awss3/acker.go @@ -28,8 +28,8 @@ func newEventACKTracker(ctx context.Context) *eventACKTracker { return &eventACKTracker{ctx: ctx, cancel: cancel} } -// Add increments the number of pending ACKs by the specified amount. -func (a *eventACKTracker) Add(messageCount int64) { +// Add increments the number of pending ACKs +func (a *eventACKTracker) Add() { a.Lock() a.pendingACKs++ a.Unlock() diff --git a/x-pack/filebeat/input/awss3/acker_test.go b/x-pack/filebeat/input/awss3/acker_test.go index 3a96997f9e9f..9234479e9850 100644 --- a/x-pack/filebeat/input/awss3/acker_test.go +++ b/x-pack/filebeat/input/awss3/acker_test.go @@ -18,7 +18,7 @@ func TestEventACKTracker(t *testing.T) { t.Cleanup(cancel) acker := newEventACKTracker(ctx) - acker.Add(1) + acker.Add() acker.ACK() assert.EqualValues(t, 0, acker.pendingACKs) @@ -42,7 +42,7 @@ func TestEventACKHandler(t *testing.T) { // Create acker. Add one pending ACK. 
acker := newEventACKTracker(ctx) - acker.Add(1) + acker.Add() // Create an ACK handler and simulate one ACKed event. ackHandler := newEventACKHandler() diff --git a/x-pack/filebeat/input/awss3/input_benchmark_test.go b/x-pack/filebeat/input/awss3/input_benchmark_test.go index 50223b8fc75d..6ab04ca630fe 100644 --- a/x-pack/filebeat/input/awss3/input_benchmark_test.go +++ b/x-pack/filebeat/input/awss3/input_benchmark_test.go @@ -14,6 +14,10 @@ import ( "testing" "time" + "github.com/elastic/beats/v7/libbeat/beat" + + "github.com/aws/aws-sdk-go-v2/aws" + "github.com/aws/aws-sdk-go-v2/service/s3" "github.com/aws/aws-sdk-go-v2/service/sqs" "github.com/dustin/go-humanize" @@ -24,9 +28,12 @@ import ( "github.com/elastic/beats/v7/libbeat/logp" "github.com/elastic/beats/v7/libbeat/monitoring" pubtest "github.com/elastic/beats/v7/libbeat/publisher/testing" + "github.com/elastic/beats/v7/libbeat/statestore" + "github.com/elastic/beats/v7/libbeat/statestore/storetest" ) const cloudtrailTestFile = "testdata/aws-cloudtrail.json.gz" +const totListingObjects = 10000 type constantSQS struct { msgs []sqs.Message @@ -54,13 +61,62 @@ func (_ *constantSQS) ChangeMessageVisibility(ctx context.Context, msg *sqs.Mess return nil } +type s3PagerConstant struct { + objects []s3.Object + currentIndex int +} + +var _ s3Pager = (*s3PagerConstant)(nil) + +func (c *s3PagerConstant) Next(ctx context.Context) bool { + return c.currentIndex < len(c.objects) +} + +func (c *s3PagerConstant) CurrentPage() *s3.ListObjectsOutput { + ret := &s3.ListObjectsOutput{} + pageSize := 1000 + if len(c.objects) < c.currentIndex+pageSize { + pageSize = len(c.objects) - c.currentIndex + } + + ret.Contents = c.objects[c.currentIndex : c.currentIndex+pageSize] + c.currentIndex = c.currentIndex + pageSize + + return ret +} + +func (c *s3PagerConstant) Err() error { + if c.currentIndex >= len(c.objects) { + c.currentIndex = 0 + } + return nil +} + +func newS3PagerConstant() *s3PagerConstant { + lastModified := time.Now() + ret := &s3PagerConstant{ + currentIndex: 0, + } + + for i := 0; i < totListingObjects; i++ { + ret.objects = append(ret.objects, s3.Object{ + Key: aws.String(fmt.Sprintf("key-%d.json.gz", i)), + ETag: aws.String(fmt.Sprintf("etag-%d", i)), + LastModified: aws.Time(lastModified), + }) + } + + return ret +} + type constantS3 struct { - filename string - data []byte - contentType string + filename string + data []byte + contentType string + pagerConstant s3Pager } -var _ s3Getter = (*constantS3)(nil) +var _ s3API = (*constantS3)(nil) func newConstantS3(t testing.TB) *constantS3 { data, err := ioutil.ReadFile(cloudtrailTestFile) @@ -79,6 +135,10 @@ func (c constantS3) GetObject(ctx context.Context, bucket, key string) (*s3.GetO return newS3GetObjectResponse(c.filename, c.data, c.contentType), nil } +func (c constantS3) ListObjectsPaginator(bucket string) s3Pager { + return c.pagerConstant +} + func makeBenchmarkConfig(t testing.TB) config { cfg := common.MustNewConfigFrom(`--- queue_url: foo @@ -95,7 +155,7 @@ file_selectors: return inputConfig } -func benchmarkInput(t *testing.T, maxMessagesInflight int) testing.BenchmarkResult { +func benchmarkInputSQS(t *testing.T, maxMessagesInflight int) testing.BenchmarkResult { return testing.Benchmark(func(b *testing.B) { log := logp.NewLogger(inputName) metricRegistry := monitoring.NewRegistry() @@ -151,21 +211,21 @@ func benchmarkInput(t *testing.T, maxMessagesInflight int) testing.BenchmarkResu }) } -func TestBenchmarkInput(t *testing.T) { +func TestBenchmarkInputSQS(t *testing.T) { 
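+	// The SQS benchmark sweeps the number of in-flight messages from 1 to
+	// 1024, doubling it each run; every run replays the same constant SQS
+	// notification for the same gzipped CloudTrail fixture, so the table
+	// rendered below compares only the effect of the in-flight limit.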
logp.TestingSetup(logp.WithLevel(logp.InfoLevel)) results := []testing.BenchmarkResult{ - benchmarkInput(t, 1), - benchmarkInput(t, 2), - benchmarkInput(t, 4), - benchmarkInput(t, 8), - benchmarkInput(t, 16), - benchmarkInput(t, 32), - benchmarkInput(t, 64), - benchmarkInput(t, 128), - benchmarkInput(t, 256), - benchmarkInput(t, 512), - benchmarkInput(t, 1024), + benchmarkInputSQS(t, 1), + benchmarkInputSQS(t, 2), + benchmarkInputSQS(t, 4), + benchmarkInputSQS(t, 8), + benchmarkInputSQS(t, 16), + benchmarkInputSQS(t, 32), + benchmarkInputSQS(t, 64), + benchmarkInputSQS(t, 128), + benchmarkInputSQS(t, 256), + benchmarkInputSQS(t, 512), + benchmarkInputSQS(t, 1024), } headers := []string{ @@ -191,3 +251,119 @@ func TestBenchmarkInput(t *testing.T) { table.AppendBulk(data) table.Render() } + +func benchmarkInputS3(t *testing.T, numberOfWorkers int) testing.BenchmarkResult { + return testing.Benchmark(func(b *testing.B) { + log := logp.NewLogger(inputName) + metricRegistry := monitoring.NewRegistry() + metrics := newInputMetrics(metricRegistry, "test_id") + s3API := newConstantS3(t) + s3API.pagerConstant = newS3PagerConstant() + client := pubtest.NewChanClientWithCallback(100, func(event beat.Event) { + event.Private.(*eventACKTracker).ACK() + }) + + defer close(client.Channel) + conf := makeBenchmarkConfig(t) + + storeReg := statestore.NewRegistry(storetest.NewMemoryStoreBackend()) + store, err := storeReg.Get("test") + if err != nil { + t.Fatalf("Failed to access store: %v", err) + } + + err = store.Set(awsS3WriteCommitPrefix+"bucket", &commitWriteState{time.Time{}}) + if err != nil { + t.Fatalf("Failed to reset store: %v", err) + } + + s3EventHandlerFactory := newS3ObjectProcessorFactory(log.Named("s3"), metrics, s3API, client, conf.FileSelectors) + s3Poller := newS3Poller(logp.NewLogger(inputName), metrics, s3API, s3EventHandlerFactory, newStates(), store, "bucket", numberOfWorkers, time.Second) + + ctx, cancel := context.WithCancel(context.Background()) + b.Cleanup(cancel) + + go func() { + for metrics.s3ObjectsAckedTotal.Get() < totListingObjects { + time.Sleep(5 * time.Millisecond) + } + cancel() + }() + + b.ResetTimer() + start := time.Now() + if err := s3Poller.Poll(ctx); err != nil { + if !errors.Is(err, context.DeadlineExceeded) { + t.Fatal(err) + } + } + b.StopTimer() + elapsed := time.Since(start) + + b.ReportMetric(float64(numberOfWorkers), "number_of_workers") + b.ReportMetric(elapsed.Seconds(), "sec") + + b.ReportMetric(float64(metrics.s3EventsCreatedTotal.Get()), "events") + b.ReportMetric(float64(metrics.s3EventsCreatedTotal.Get())/elapsed.Seconds(), "events_per_sec") + + b.ReportMetric(float64(metrics.s3BytesProcessedTotal.Get()), "s3_bytes") + b.ReportMetric(float64(metrics.s3BytesProcessedTotal.Get())/elapsed.Seconds(), "s3_bytes_per_sec") + + b.ReportMetric(float64(metrics.s3ObjectsListedTotal.Get()), "objects_listed") + b.ReportMetric(float64(metrics.s3ObjectsListedTotal.Get())/elapsed.Seconds(), "objects_listed_per_sec") + + b.ReportMetric(float64(metrics.s3ObjectsProcessedTotal.Get()), "objects_processed") + b.ReportMetric(float64(metrics.s3ObjectsProcessedTotal.Get())/elapsed.Seconds(), "objects_processed_per_sec") + + b.ReportMetric(float64(metrics.s3ObjectsAckedTotal.Get()), "objects_acked") + b.ReportMetric(float64(metrics.s3ObjectsAckedTotal.Get())/elapsed.Seconds(), "objects_acked_per_sec") + + }) +} + +func TestBenchmarkInputS3(t *testing.T) { + logp.TestingSetup(logp.WithLevel(logp.InfoLevel)) + + results := []testing.BenchmarkResult{ + benchmarkInputS3(t, 1), 
+ benchmarkInputS3(t, 2), + benchmarkInputS3(t, 4), + benchmarkInputS3(t, 8), + benchmarkInputS3(t, 16), + benchmarkInputS3(t, 32), + benchmarkInputS3(t, 64), + benchmarkInputS3(t, 128), + benchmarkInputS3(t, 256), + benchmarkInputS3(t, 512), + benchmarkInputS3(t, 1024), + } + + headers := []string{ + "Number of workers", + "Objects listed per sec", + "Objects processed per sec", + "Objects acked per sec", + "Events per sec", + "S3 Bytes per sec", + "Time (sec)", + "CPUs", + } + var data [][]string + for _, r := range results { + data = append(data, []string{ + fmt.Sprintf("%v", r.Extra["number_of_workers"]), + fmt.Sprintf("%v", r.Extra["objects_listed_per_sec"]), + fmt.Sprintf("%v", r.Extra["objects_processed_per_sec"]), + fmt.Sprintf("%v", r.Extra["objects_acked_per_sec"]), + fmt.Sprintf("%v", r.Extra["events_per_sec"]), + fmt.Sprintf("%v", humanize.Bytes(uint64(r.Extra["s3_bytes_per_sec"]))), + fmt.Sprintf("%v", r.Extra["sec"]), + fmt.Sprintf("%v", runtime.GOMAXPROCS(0)), + }) + } + + table := tablewriter.NewWriter(os.Stdout) + table.SetHeader(headers) + table.AppendBulk(data) + table.Render() +} diff --git a/x-pack/filebeat/input/awss3/metrics.go b/x-pack/filebeat/input/awss3/metrics.go index 57f565612681..6a7bd59599c4 100644 --- a/x-pack/filebeat/input/awss3/metrics.go +++ b/x-pack/filebeat/input/awss3/metrics.go @@ -25,6 +25,9 @@ type inputMetrics struct { sqsMessageProcessingTime metrics.Sample // Histogram of the elapsed SQS processing times in nanoseconds (time of receipt to time of delete/return). s3ObjectsRequestedTotal *monitoring.Uint // Number of S3 objects downloaded. + s3ObjectsAckedTotal *monitoring.Uint // Number of S3 objects fully ACKed. + s3ObjectsListedTotal *monitoring.Uint // Number of S3 objects listed. + s3ObjectsProcessedTotal *monitoring.Uint // Number of S3 objects processed. s3BytesProcessedTotal *monitoring.Uint // Number of S3 bytes processed. s3EventsCreatedTotal *monitoring.Uint // Number of events created from processing S3 data. s3ObjectsInflight *monitoring.Uint // Number of S3 objects inflight (gauge). 
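
The three counters added above form a funnel for the bucket-polling path: every object returned by a listing increments `s3_objects_listed_total`, only objects that pass the `file_selectors` rules increment `s3_objects_processed_total`, and `s3_objects_acked_total` grows once every event published from an object has been ACKed, so at any point listed >= processed >= acked. A minimal standalone sketch of that relationship, mirroring the counter names registered by `newInputMetrics` but using a made-up suffix check in place of the real selector logic:

["source","go"]
----
package main

import (
	"fmt"
	"strings"

	"github.com/elastic/beats/v7/libbeat/monitoring"
)

func main() {
	// Register three counters under the same names newInputMetrics uses.
	reg := monitoring.NewRegistry()
	listed := monitoring.NewUint(reg, "s3_objects_listed_total")
	processed := monitoring.NewUint(reg, "s3_objects_processed_total")
	acked := monitoring.NewUint(reg, "s3_objects_acked_total")

	// Simulated listing page; only the *.json.gz keys "match" the selectors.
	for _, key := range []string{"a.json.gz", "b.txt", "c.json.gz"} {
		listed.Inc()
		if !strings.HasSuffix(key, ".json.gz") { // stand-in for file_selectors
			continue
		}
		processed.Inc()
		// The input increments the acked counter only after every event
		// published from the object has been ACKed by the pipeline.
		acked.Inc()
	}

	fmt.Println(listed.Get(), processed.Get(), acked.Get()) // 3 2 2
}
----
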
@@ -50,6 +53,9 @@ func newInputMetrics(parent *monitoring.Registry, id string) *inputMetrics { sqsMessagesDeletedTotal: monitoring.NewUint(reg, "sqs_messages_deleted_total"), sqsMessageProcessingTime: metrics.NewUniformSample(1024), s3ObjectsRequestedTotal: monitoring.NewUint(reg, "s3_objects_requested_total"), + s3ObjectsAckedTotal: monitoring.NewUint(reg, "s3_objects_acked_total"), + s3ObjectsListedTotal: monitoring.NewUint(reg, "s3_objects_listed_total"), + s3ObjectsProcessedTotal: monitoring.NewUint(reg, "s3_objects_processed_total"), s3BytesProcessedTotal: monitoring.NewUint(reg, "s3_bytes_processed_total"), s3EventsCreatedTotal: monitoring.NewUint(reg, "s3_events_created_total"), s3ObjectsInflight: monitoring.NewUint(reg, "s3_objects_inflight_gauge"), diff --git a/x-pack/filebeat/input/awss3/s3.go b/x-pack/filebeat/input/awss3/s3.go index c67266d61ef9..c7fdab08f753 100644 --- a/x-pack/filebeat/input/awss3/s3.go +++ b/x-pack/filebeat/input/awss3/s3.go @@ -81,7 +81,7 @@ func newS3Poller(log *logp.Logger, } } -func (p *s3Poller) process(s3ObjectPayloadChan <-chan *s3ObjectPayload) error { +func (p *s3Poller) ProcessObject(s3ObjectPayloadChan <-chan *s3ObjectPayload, workerN int) error { var errs []error processingLoop: @@ -92,18 +92,9 @@ processingLoop: break processingLoop } - info := s3ObjectPayload.s3ObjectInfo - state := newState(info.name, info.key, info.etag, info.lastModified) - //check if another worker already is on it - dummyValue := struct{}{} - _, loaded := p.workersProcessingMap.LoadOrStore(state.Id, dummyValue) - if loaded { - // another worker is processing the state - continue - } - // Process S3 object (download, parse, create events). err := s3ObjectPayload.s3ObjectHandler.ProcessS3Object() + // Wait for all events to be ACKed before proceeding. s3ObjectPayload.s3ObjectHandler.Wait() @@ -113,57 +104,63 @@ processingLoop: "failed processing S3 event for object key %q in bucket %q", event.S3.Object.Key, event.S3.Bucket.Name)) - // Manage locks for processing. - p.workersProcessingMap.Delete(state.Id) - continue } - // Manage locks for purging. + info := s3ObjectPayload.s3ObjectInfo id := info.name + info.key previousState := p.states.FindPreviousByID(id) if !previousState.IsEmpty() { - state.MarkAsStored() - p.states.Update(state, info.listingID) - p.states.writeStates(p.store) + previousState.MarkAsStored() + p.states.Update(previousState, info.listingID) } + // Manage locks for purging. if p.states.IsListingFullyStored(info.listingID) { // locked on processing we unlock when all the object were ACKed lock, _ := p.workersListingMap.Load(info.listingID) lock.(*sync.Mutex).Unlock() } - // Manage locks for processing. - p.workersProcessingMap.Delete(state.Id) + // Metrics + p.metrics.s3ObjectsAckedTotal.Inc() } } return multierr.Combine(errs...) 
} -func (p *s3Poller) getS3Objects(ctx context.Context, s3ObjectPayloadChan chan<- *s3ObjectPayload) { - defer close(s3ObjectPayloadChan) +func (p *s3Poller) GetS3Objects(ctx context.Context, s3ObjectPayloadChan chan<- *s3ObjectPayload) { + defer func() { + close(s3ObjectPayloadChan) + }() + bucketMetadata := strings.Split(p.bucket, ":") bucketName := bucketMetadata[len(bucketMetadata)-1] paginator := p.s3.ListObjectsPaginator(bucketName) for paginator.Next(ctx) { - lock := new(sync.Mutex) listingID, err := uuid.NewV4() if err != nil { p.log.Warnw("Error generating UUID for listing page.", "error", err) continue } + // lock for the listing page and state in workersListingMap // this map is shared with the storedOp and will be unlocked there + lock := new(sync.Mutex) lock.Lock() p.workersListingMap.Store(listingID.String(), lock) - totProcessableObjects := 0 page := paginator.CurrentPage() - s3ObjectPayloadChanByPage := make(chan *s3ObjectPayload, len(page.Contents)) + + totProcessableObjects := 0 + totListedObjects := len(page.Contents) + s3ObjectPayloadChanByPage := make(chan *s3ObjectPayload, totListedObjects) + + // Metrics + p.metrics.s3ObjectsListedTotal.Add(uint64(totListedObjects)) for _, object := range page.Contents { // Unescape s3 key name. For example, convert "%3D" back to "=". filename, err := url.QueryUnescape(*object.Key) @@ -206,11 +203,16 @@ func (p *s3Poller) getS3Objects(ctx context.Context, s3ObjectPayloadChan chan<- } } - listingInfo := listingInfo{totObjects: totProcessableObjects} - p.states.AddListing(listingID.String(), listingInfo) if totProcessableObjects == 0 { // nothing to be ACKed, unlock here + p.states.DeleteListing(listingID.String()) lock.Unlock() + } else { + listingInfo := &listingInfo{totObjects: totProcessableObjects} + p.states.AddListing(listingID.String(), listingInfo) + + // Metrics + p.metrics.s3ObjectsProcessedTotal.Add(uint64(totProcessableObjects)) } close(s3ObjectPayloadChanByPage) @@ -226,11 +228,19 @@ func (p *s3Poller) getS3Objects(ctx context.Context, s3ObjectPayloadChan chan<- return } -func (p *s3Poller) purge() { - for _, listingID := range p.states.GetListingIDs() { +func (p *s3Poller) Purge() { + listingIDs := p.states.GetListingIDs() + for _, listingID := range listingIDs { // we lock here in order to process the purge only after // full listing page is ACKed by all the workers - lock, _ := p.workersListingMap.Load(listingID) + lock, loaded := p.workersListingMap.Load(listingID) + if !loaded { + // purge calls can overlap, GetListingIDs can return + // an outdated snapshot with listing already purged + p.states.DeleteListing(listingID) + continue + } + lock.(*sync.Mutex).Lock() keys := map[string]struct{}{} @@ -276,7 +286,6 @@ func (p *s3Poller) purge() { } } - // workersListingMap map is shared with the storedop // purge is done, we can unlock and clean lock.(*sync.Mutex).Unlock() p.workersListingMap.Delete(listingID) @@ -292,36 +301,39 @@ func (p *s3Poller) Poll(ctx context.Context) error { // listing, sequentially processes every object and then does another listing workerWg := new(sync.WaitGroup) for ctx.Err() == nil { - // Determine how many S3 workers are available. 
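+		// Ask the worker semaphore for up to numberOfWorkers free slots; a
+		// zero result means every worker is still busy with the previous
+		// listing, so the loop simply tries again.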
workers, err := p.workerSem.AcquireContext(p.numberOfWorkers, ctx) if err != nil { break } + if workers == 0 { + continue + } + s3ObjectPayloadChan := make(chan *s3ObjectPayload) workerWg.Add(1) go func() { defer workerWg.Done() - p.getS3Objects(ctx, s3ObjectPayloadChan) + + p.GetS3Objects(ctx, s3ObjectPayloadChan) + p.Purge() }() workerWg.Add(workers) for i := 0; i < workers; i++ { - go func() { + go func(i int) { defer func() { workerWg.Done() p.workerSem.Release(1) }() - if err := p.process(s3ObjectPayloadChan); err != nil { + if err := p.ProcessObject(s3ObjectPayloadChan, i); err != nil { p.log.Warnw("Failed processing S3 listing.", "error", err) } - }() + }(i) } - p.purge() - <-time.After(p.bucketPollInterval) } diff --git a/x-pack/filebeat/input/awss3/s3_objects.go b/x-pack/filebeat/input/awss3/s3_objects.go index 4841bebdaad9..f486fbac0321 100644 --- a/x-pack/filebeat/input/awss3/s3_objects.go +++ b/x-pack/filebeat/input/awss3/s3_objects.go @@ -303,7 +303,7 @@ func (p *s3ObjectProcessor) readFile(r io.Reader) error { } func (p *s3ObjectProcessor) publish(ack *eventACKTracker, event *beat.Event) { - ack.Add(1) + ack.Add() event.Private = ack p.metrics.s3EventsCreatedTotal.Inc() p.publisher.Publish(*event) diff --git a/x-pack/filebeat/input/awss3/states.go b/x-pack/filebeat/input/awss3/states.go index a13230f9e25c..8b1f89a9a33d 100644 --- a/x-pack/filebeat/input/awss3/states.go +++ b/x-pack/filebeat/input/awss3/states.go @@ -21,6 +21,7 @@ const ( type listingInfo struct { totObjects int storedObjects int + finalCheck bool } // states handles list of s3 object state. One must use newStates to instantiate a @@ -52,10 +53,6 @@ func newStates() *states { func (s *states) MustSkip(state state, store *statestore.Store) bool { if !s.IsNew(state) { - // here we should purge from the store - s.Delete(state.Id) - s.writeStates(store) - return true } @@ -89,22 +86,29 @@ func (s *states) Delete(id string) { last := len(s.states) - 1 s.states[last], s.states[index] = s.states[index], s.states[last] s.states = s.states[:last] - } - s.idx = map[string]int{} - for i, state := range s.states { - s.idx[state.Id] = i + s.idx = map[string]int{} + for i, state := range s.states { + s.idx[state.Id] = i + } } } // IsListingFullyStored check if listing if fully stored +// After first time the condition is met it will always return false func (s *states) IsListingFullyStored(listingID string) bool { info, _ := s.listingInfo.Load(listingID) - return info.(listingInfo).storedObjects == info.(listingInfo).totObjects + listingInfo := info.(*listingInfo) + if listingInfo.finalCheck { + return false + } + + listingInfo.finalCheck = listingInfo.storedObjects == listingInfo.totObjects + return listingInfo.finalCheck } // AddListing add listing info -func (s *states) AddListing(listingID string, listingInfo listingInfo) { +func (s *states) AddListing(listingID string, listingInfo *listingInfo) { s.Lock() defer s.Unlock() s.listingIDs[listingID] = struct{}{} @@ -141,12 +145,10 @@ func (s *states) Update(newState state, listingID string) { return } - // listing map is shared with the collector // here we increase the number of stored object info, _ := s.listingInfo.Load(listingID) - listingInfo := info.(listingInfo) + listingInfo := info.(*listingInfo) listingInfo.storedObjects++ - s.listingInfo.Store(listingID, listingInfo) if _, ok := s.statesByListingID[listingID]; !ok { s.statesByListingID[listingID] = make([]state, 0) From d7e6d749b0ad8a3be0566a0163679de0a954e8dd Mon Sep 17 00:00:00 2001 From: Andrea Spacca Date: 
Fri, 13 Aug 2021 13:16:30 +0200 Subject: [PATCH 10/20] fix bug --- x-pack/filebeat/input/awss3/s3.go | 51 ++++++++++++++--------- x-pack/filebeat/input/awss3/s3_test.go | 4 +- x-pack/filebeat/input/awss3/state.go | 8 ++++ x-pack/filebeat/input/awss3/state_test.go | 18 ++++++++ x-pack/filebeat/input/awss3/states.go | 15 +++++-- 5 files changed, 71 insertions(+), 25 deletions(-) diff --git a/x-pack/filebeat/input/awss3/s3.go b/x-pack/filebeat/input/awss3/s3.go index c7fdab08f753..a103bf2964fb 100644 --- a/x-pack/filebeat/input/awss3/s3.go +++ b/x-pack/filebeat/input/awss3/s3.go @@ -81,7 +81,28 @@ func newS3Poller(log *logp.Logger, } } -func (p *s3Poller) ProcessObject(s3ObjectPayloadChan <-chan *s3ObjectPayload, workerN int) error { +func (p *s3Poller) handlePurgingLock(info s3ObjectInfo, isStored bool) { + id := info.name + info.key + previousState := p.states.FindPreviousByID(id) + if !previousState.IsEmpty() { + if isStored { + previousState.MarkAsStored() + } else { + previousState.MarkAsError() + } + + p.states.Update(previousState, info.listingID) + } + + // Manage locks for purging. + if p.states.IsListingFullyStored(info.listingID) { + // locked on processing we unlock when all the object were ACKed + lock, _ := p.workersListingMap.Load(info.listingID) + lock.(*sync.Mutex).Unlock() + } +} + +func (p *s3Poller) ProcessObject(s3ObjectPayloadChan <-chan *s3ObjectPayload) error { var errs []error processingLoop: @@ -98,30 +119,20 @@ processingLoop: // Wait for all events to be ACKed before proceeding. s3ObjectPayload.s3ObjectHandler.Wait() + info := s3ObjectPayload.s3ObjectInfo + if err != nil { event := s3ObjectPayload.s3ObjectEvent errs = append(errs, errors.Wrapf(err, "failed processing S3 event for object key %q in bucket %q", event.S3.Object.Key, event.S3.Bucket.Name)) + p.handlePurgingLock(info, false) continue } - info := s3ObjectPayload.s3ObjectInfo - id := info.name + info.key - previousState := p.states.FindPreviousByID(id) - if !previousState.IsEmpty() { - previousState.MarkAsStored() - p.states.Update(previousState, info.listingID) - } - - // Manage locks for purging. 
- if p.states.IsListingFullyStored(info.listingID) { - // locked on processing we unlock when all the object were ACKed - lock, _ := p.workersListingMap.Load(info.listingID) - lock.(*sync.Mutex).Unlock() - } + p.handlePurgingLock(info, true) // Metrics p.metrics.s3ObjectsAckedTotal.Inc() @@ -315,7 +326,9 @@ func (p *s3Poller) Poll(ctx context.Context) error { workerWg.Add(1) go func() { - defer workerWg.Done() + defer func() { + workerWg.Done() + }() p.GetS3Objects(ctx, s3ObjectPayloadChan) p.Purge() @@ -323,15 +336,15 @@ func (p *s3Poller) Poll(ctx context.Context) error { workerWg.Add(workers) for i := 0; i < workers; i++ { - go func(i int) { + go func() { defer func() { workerWg.Done() p.workerSem.Release(1) }() - if err := p.ProcessObject(s3ObjectPayloadChan, i); err != nil { + if err := p.ProcessObject(s3ObjectPayloadChan); err != nil { p.log.Warnw("Failed processing S3 listing.", "error", err) } - }(i) + }() } <-time.After(p.bucketPollInterval) diff --git a/x-pack/filebeat/input/awss3/s3_test.go b/x-pack/filebeat/input/awss3/s3_test.go index a98e51ccaef8..bd3a544bd040 100644 --- a/x-pack/filebeat/input/awss3/s3_test.go +++ b/x-pack/filebeat/input/awss3/s3_test.go @@ -27,8 +27,8 @@ func TestS3Poller(t *testing.T) { logp.TestingSetup() const bucket = "bucket" const numberOfWorkers = 5 - const pollInterval = 10 * time.Second - const testTimeout = 5 * time.Millisecond + const pollInterval = 2 * time.Second + const testTimeout = 1 * time.Second t.Run("Poll success", func(t *testing.T) { storeReg := statestore.NewRegistry(storetest.NewMemoryStoreBackend()) diff --git a/x-pack/filebeat/input/awss3/state.go b/x-pack/filebeat/input/awss3/state.go index 0ace11ea6923..c772935a7eaa 100644 --- a/x-pack/filebeat/input/awss3/state.go +++ b/x-pack/filebeat/input/awss3/state.go @@ -19,6 +19,8 @@ type state struct { // A state has Stored = true when all events are ACKed. Stored bool `json:"stored" struct:"stored"` + // A state has Error = true when ProcessS3Object returned an error + Error bool `json:"error" struct:"error"` } // newState creates a new s3 object state @@ -29,6 +31,7 @@ func newState(bucket, key, etag string, lastModified time.Time) state { LastModified: lastModified, Etag: etag, Stored: false, + Error: false, } s.Id = s.Bucket + s.Key + s.Etag + s.LastModified.String() @@ -41,6 +44,11 @@ func (s *state) MarkAsStored() { s.Stored = true } +// MarkAsStored set the error flag to true +func (s *state) MarkAsError() { + s.Error = true +} + // IsEqual checks if the two states point to the same s3 object. 
func (s *state) IsEqual(c *state) bool { return s.Bucket == c.Bucket && s.Key == c.Key && s.Etag == c.Etag && s.LastModified.Equal(c.LastModified) diff --git a/x-pack/filebeat/input/awss3/state_test.go b/x-pack/filebeat/input/awss3/state_test.go index 07db57329673..b304d74920fc 100644 --- a/x-pack/filebeat/input/awss3/state_test.go +++ b/x-pack/filebeat/input/awss3/state_test.go @@ -54,6 +54,24 @@ func TestStateIsEqual(t *testing.T) { }, true, }, + "two states pointing to the same key with same etag and same last modified error": { + [2]state{ + state{ + Bucket: "bucket a", + Key: "/key/to/this/file/1", + Etag: "etag", + LastModified: lastModifed, + Error: true, + }, + state{ + Bucket: "bucket a", + Key: "/key/to/this/file/1", + Etag: "etag", + LastModified: lastModifed, + }, + }, + true, + }, "two states pointing to the same key with different etag and same last modified": { [2]state{ state{ diff --git a/x-pack/filebeat/input/awss3/states.go b/x-pack/filebeat/input/awss3/states.go index 8b1f89a9a33d..154e4a5490f3 100644 --- a/x-pack/filebeat/input/awss3/states.go +++ b/x-pack/filebeat/input/awss3/states.go @@ -21,6 +21,7 @@ const ( type listingInfo struct { totObjects int storedObjects int + errorObjects int finalCheck bool } @@ -70,7 +71,7 @@ func (s *states) MustSkip(state state, store *statestore.Store) bool { // we have no previous state or the previous state // is not stored: refresh the state - if previousState.IsEmpty() || !previousState.Stored { + if previousState.IsEmpty() || (!previousState.Stored && !previousState.Error) { s.Update(state, "") } @@ -103,7 +104,7 @@ func (s *states) IsListingFullyStored(listingID string) bool { return false } - listingInfo.finalCheck = listingInfo.storedObjects == listingInfo.totObjects + listingInfo.finalCheck = (listingInfo.storedObjects + listingInfo.errorObjects) == listingInfo.totObjects return listingInfo.finalCheck } @@ -141,14 +142,20 @@ func (s *states) Update(newState state, listingID string) { logp.Debug("input", "New state added for %s", newState.Id) } - if listingID == "" || !newState.Stored { + if listingID == "" || (!newState.Stored && !newState.Error) { return } // here we increase the number of stored object info, _ := s.listingInfo.Load(listingID) listingInfo := info.(*listingInfo) - listingInfo.storedObjects++ + if newState.Stored { + listingInfo.storedObjects++ + } + + if newState.Error { + listingInfo.errorObjects++ + } if _, ok := s.statesByListingID[listingID]; !ok { s.statesByListingID[listingID] = make([]state, 0) From a997b912b7994b70e62812cf57cccec413ab8e18 Mon Sep 17 00:00:00 2001 From: Andrea Spacca Date: Fri, 13 Aug 2021 15:31:53 +0200 Subject: [PATCH 11/20] cr fixes --- x-pack/filebeat/input/awss3/s3.go | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/x-pack/filebeat/input/awss3/s3.go b/x-pack/filebeat/input/awss3/s3.go index a103bf2964fb..150b54444c95 100644 --- a/x-pack/filebeat/input/awss3/s3.go +++ b/x-pack/filebeat/input/awss3/s3.go @@ -11,14 +11,14 @@ import ( "sync" "time" - "github.com/elastic/beats/v7/libbeat/statestore" - "github.com/gofrs/uuid" "github.com/pkg/errors" "go.uber.org/multierr" "github.com/elastic/beats/v7/libbeat/logp" "github.com/elastic/beats/v7/libbeat/monitoring" + "github.com/elastic/beats/v7/libbeat/statestore" + "github.com/elastic/go-concert/timed" ) type commitWriteState struct { @@ -347,7 +347,8 @@ func (p *s3Poller) Poll(ctx context.Context) error { }() } - <-time.After(p.bucketPollInterval) + timed.Wait(ctx, p.bucketPollInterval) + } // Wait for all 
workers to finish. From 0895cbca213044ba28e66e149f370c17c39e98b0 Mon Sep 17 00:00:00 2001 From: Andrea Spacca Date: Fri, 13 Aug 2021 16:04:55 +0200 Subject: [PATCH 12/20] integration test --- .../input/awss3/_meta/terraform/README.md | 10 +- .../input/awss3/input_integration_test.go | 128 +++++++++++++++++- 2 files changed, 127 insertions(+), 11 deletions(-) diff --git a/x-pack/filebeat/input/awss3/_meta/terraform/README.md b/x-pack/filebeat/input/awss3/_meta/terraform/README.md index cdb209e7099f..3f18fc424a03 100644 --- a/x-pack/filebeat/input/awss3/_meta/terraform/README.md +++ b/x-pack/filebeat/input/awss3/_meta/terraform/README.md @@ -20,7 +20,9 @@ write the `outputs.yml`. `terraform apply` -2. (Optional) View the output configuration. + +2. View the output configuration and assure the region match in the aws profile used to run +the test or to set the environment variable `AWS_REGION` to the value in the output. ```yaml "aws_region": "us-east-1" @@ -28,14 +30,14 @@ write the `outputs.yml`. "queue_url": "https://sqs.us-east-1.amazonaws.com/144492464627/filebeat-s3-integtest-8iok1h" ``` -2. Execute the integration test. +4. Execute the integration test. ``` cd x-pack/filebeat/inputs/awss3 - go test -tags aws,integration -run TestInputRun -v . + go test -tags aws,integration -run TestInputRun.+ -v . ``` -3. Cleanup AWS resources. Execute terraform to remove the SQS queue and delete +5. Cleanup AWS resources. Execute terraform to remove the SQS queue and delete the S3 bucket and its contents. `terraform destroy` diff --git a/x-pack/filebeat/input/awss3/input_integration_test.go b/x-pack/filebeat/input/awss3/input_integration_test.go index 0580b6f067b2..80f2dbc15f03 100644 --- a/x-pack/filebeat/input/awss3/input_integration_test.go +++ b/x-pack/filebeat/input/awss3/input_integration_test.go @@ -4,9 +4,6 @@ // See _meta/terraform/README.md for integration test usage instructions. 
-// +build integration -// +build aws - package awss3 import ( @@ -19,6 +16,8 @@ import ( "testing" "time" + "github.com/elastic/beats/v7/filebeat/beater" + "github.com/aws/aws-sdk-go-v2/aws" "github.com/aws/aws-sdk-go-v2/aws/external" "github.com/aws/aws-sdk-go-v2/service/s3/s3manager" @@ -32,6 +31,8 @@ import ( "github.com/elastic/beats/v7/libbeat/logp" "github.com/elastic/beats/v7/libbeat/monitoring" pubtest "github.com/elastic/beats/v7/libbeat/publisher/testing" + "github.com/elastic/beats/v7/libbeat/statestore" + "github.com/elastic/beats/v7/libbeat/statestore/storetest" ) const ( @@ -69,7 +70,35 @@ func getTerraformOutputs(t *testing.T) terraformOutputData { return rtn } -func makeTestConfig(queueURL string) *common.Config { +func makeTestConfigS3(s3bucket string) *common.Config { + return common.MustNewConfigFrom(fmt.Sprintf(`--- +s3_bucket: aws:s3:::%s +s3_bucket_number_of_workers: 1 +file_selectors: +- + regex: 'events-array.json$' + expand_event_list_from_field: Events + content_type: application/json + include_s3_metadata: + - last-modified + - x-amz-version-id + - x-amz-storage-class + - Content-Length + - Content-Type +- + regex: '\.(?:nd)?json(\.gz)?$' + content_type: application/json +- + regex: 'multiline.txt$' + parsers: + - multiline: + pattern: "^ Date: Fri, 13 Aug 2021 18:36:05 +0200 Subject: [PATCH 13/20] revert missing build tags --- x-pack/filebeat/input/awss3/input_integration_test.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/x-pack/filebeat/input/awss3/input_integration_test.go b/x-pack/filebeat/input/awss3/input_integration_test.go index 80f2dbc15f03..7d83769a4f51 100644 --- a/x-pack/filebeat/input/awss3/input_integration_test.go +++ b/x-pack/filebeat/input/awss3/input_integration_test.go @@ -4,6 +4,9 @@ // See _meta/terraform/README.md for integration test usage instructions. 
+// +build integration +// +build aws + package awss3 import ( From 193664b351cabdf69c9706dc29a2e1e39c2cd114 Mon Sep 17 00:00:00 2001 From: Andrea Spacca Date: Mon, 16 Aug 2021 11:53:32 +0200 Subject: [PATCH 14/20] cr fixes --- filebeat/docs/modules/aws.asciidoc | 16 +++--- .../docs/inputs/input-aws-s3.asciidoc | 17 +++--- .../input/awss3/_meta/terraform/README.md | 6 +- x-pack/filebeat/input/awss3/acker.go | 2 +- x-pack/filebeat/input/awss3/config.go | 55 +++++++++---------- x-pack/filebeat/input/awss3/config_test.go | 51 +++++++---------- x-pack/filebeat/input/awss3/input.go | 33 +++++------ .../input/awss3/input_benchmark_test.go | 11 ++-- .../input/awss3/input_integration_test.go | 4 +- x-pack/filebeat/input/awss3/metrics.go | 9 ++- x-pack/filebeat/input/awss3/s3.go | 17 +++--- x-pack/filebeat/input/awss3/s3_test.go | 4 +- x-pack/filebeat/input/awss3/state.go | 15 +++-- x-pack/filebeat/input/awss3/states.go | 25 +++++---- x-pack/filebeat/input/awss3/states_test.go | 35 +++++++----- .../filebeat/module/aws/_meta/docs.asciidoc | 42 ++++++++------ .../module/aws/cloudtrail/config/aws-s3.yml | 12 ++-- .../module/aws/cloudtrail/manifest.yml | 6 +- .../module/aws/cloudwatch/config/aws-s3.yml | 12 ++-- .../module/aws/cloudwatch/manifest.yml | 6 +- .../filebeat/module/aws/ec2/config/aws-s3.yml | 12 ++-- x-pack/filebeat/module/aws/ec2/manifest.yml | 6 +- .../filebeat/module/aws/elb/config/aws-s3.yml | 12 ++-- x-pack/filebeat/module/aws/elb/manifest.yml | 6 +- .../module/aws/s3access/config/aws-s3.yml | 12 ++-- .../filebeat/module/aws/s3access/manifest.yml | 6 +- .../module/aws/vpcflow/config/input.yml | 12 ++-- .../filebeat/module/aws/vpcflow/manifest.yml | 6 +- 28 files changed, 230 insertions(+), 220 deletions(-) diff --git a/filebeat/docs/modules/aws.asciidoc b/filebeat/docs/modules/aws.asciidoc index 8ac1a47dfb30..d0564b71af42 100644 --- a/filebeat/docs/modules/aws.asciidoc +++ b/filebeat/docs/modules/aws.asciidoc @@ -50,7 +50,7 @@ Example config: enabled: false #var.queue_url: https://sqs.myregion.amazonaws.com/123456/myqueue #var.s3_bucket: 'arn:aws:s3:::mybucket - #var.s3_bucket_poll_interval: 300s + #var.bucket_list_interval: 300s #var.shared_credential_file: /etc/filebeat/aws_credentials #var.credential_profile_name: fb-aws #var.access_key_id: access_key_id @@ -66,7 +66,7 @@ Example config: enabled: false #var.queue_url: https://sqs.myregion.amazonaws.com/123456/myqueue #var.s3_bucket: 'arn:aws:s3:::mybucket - #var.s3_bucket_poll_interval: 300s + #var.bucket_list_interval: 300s #var.shared_credential_file: /etc/filebeat/aws_credentials #var.credential_profile_name: fb-aws #var.access_key_id: access_key_id @@ -82,7 +82,7 @@ Example config: enabled: false #var.queue_url: https://sqs.myregion.amazonaws.com/123456/myqueue #var.s3_bucket: 'arn:aws:s3:::mybucket - #var.s3_bucket_poll_interval: 300s + #var.bucket_list_interval: 300s #var.shared_credential_file: /etc/filebeat/aws_credentials #var.credential_profile_name: fb-aws #var.access_key_id: access_key_id @@ -98,7 +98,7 @@ Example config: enabled: false #var.queue_url: https://sqs.myregion.amazonaws.com/123456/myqueue #var.s3_bucket: 'arn:aws:s3:::mybucket - #var.s3_bucket_poll_interval: 300s + #var.bucket_list_interval: 300s #var.shared_credential_file: /etc/filebeat/aws_credentials #var.credential_profile_name: fb-aws #var.access_key_id: access_key_id @@ -114,7 +114,7 @@ Example config: enabled: false #var.queue_url: https://sqs.myregion.amazonaws.com/123456/myqueue #var.s3_bucket: 'arn:aws:s3:::mybucket - #var.s3_bucket_poll_interval: 
300s + #var.bucket_list_interval: 300s #var.shared_credential_file: /etc/filebeat/aws_credentials #var.credential_profile_name: fb-aws #var.access_key_id: access_key_id @@ -130,7 +130,7 @@ Example config: enabled: false #var.queue_url: https://sqs.myregion.amazonaws.com/123456/myqueue #var.s3_bucket: 'arn:aws:s3:::mybucket - #var.s3_bucket_poll_interval: 300s + #var.bucket_list_interval: 300s #var.shared_credential_file: /etc/filebeat/aws_credentials #var.credential_profile_name: fb-aws #var.access_key_id: access_key_id @@ -160,12 +160,12 @@ Maximum duration before AWS API request will be interrupted. Default to be 120 s AWS S3 bucket ARN (Required when `var.queue_url` is not set). -*`var.s3_bucket_number_of_workers`*:: +*`var.number_of_workers`*:: Number of workers that will process the S3 objects listed (Required when `var.s3_bucket` is set). Use to vertically scale the input. -*`var.s3_bucket_poll_interval`*:: +*`var.bucket_list_interval`*:: Interval between list requests to the S3 bucket. Default to be 120 seconds. diff --git a/x-pack/filebeat/docs/inputs/input-aws-s3.asciidoc b/x-pack/filebeat/docs/inputs/input-aws-s3.asciidoc index 1893f6335291..647c3269c6bc 100644 --- a/x-pack/filebeat/docs/inputs/input-aws-s3.asciidoc +++ b/x-pack/filebeat/docs/inputs/input-aws-s3.asciidoc @@ -46,17 +46,17 @@ be stopped and the SQS message will be returned back to the queue. When using the direct polling list of S3 objects in an S3 buckets, a number of workers that will process the S3 objects listed must be set -through the `s3_bucket_number_of_workers` config. +through the `number_of_workers` config. Listing of the S3 bucket will be polled according the time interval defined by -`s3_bucket_poll_interval` config. Default value is 120secs. +`bucket_list_interval` config. Default value is 120secs. ["source","yaml",subs="attributes"] ---- {beatname_lc}.inputs: - type: aws-s3 s3_bucket: arn:aws:s3:::test-s3-bucket - s3_bucket_number_of_workers: 5 - s3_bucket_poll_interval: 300s + number_of_workers: 5 + bucket_list_interval: 300s credential_profile_name: elastic-beats expand_event_list_from_field: Records ---- @@ -273,13 +273,13 @@ value is `20s`. ARN of the AWS S3 bucket that will be polled for list operation. (Required when `queue_url` is not set). [float] -==== `s3_bucket_poll_interval` +==== `bucket_list_interval` Time interval for polling listing of the S3 bucket: default to `120s`. [float] -==== `s3_bucket_number_of_workers` +==== `number_of_workers` Number of workers that will process the S3 objects listed. (Required when `s3_bucket` is set). @@ -349,7 +349,7 @@ they can list the same S3 bucket at the same time. Since the state of the ingest and multiple {beatname_uc} cannot share the same `path.data` this will produce repeated ingestion of the S3 object. Therefore, when using the polling list of S3 bucket objects method, scaling should be -vertical, with a single bigger {beatname_uc} instance and higher `s3_bucket_number_of_workers` +vertical, with a single bigger {beatname_uc} instance and higher `number_of_workers` config value. @@ -370,6 +370,9 @@ observe the activity of the input. | `sqs_messages_deleted_total` | Number of SQS messages deleted. | `sqs_message_processing_time` | Histogram of the elapsed SQS processing times in nanoseconds (time of receipt to time of delete/return). | `s3_objects_requested_total` | Number of S3 objects downloaded. +| `s3_objects_listed_total` | Number of S3 objects returned by list operations. 
+| `s3_objects_processed_total` | Number of S3 objects that matched file_selectors rules. +| `s3_objects_acked_total` | Number of S3 objects processed that were fully ACKed. | `s3_bytes_processed_total` | Number of S3 bytes processed. | `s3_events_created_total` | Number of events created from processing S3 data. | `s3_objects_inflight_gauge` | Number of S3 objects inflight (gauge). diff --git a/x-pack/filebeat/input/awss3/_meta/terraform/README.md b/x-pack/filebeat/input/awss3/_meta/terraform/README.md index 3f18fc424a03..7ab27781704a 100644 --- a/x-pack/filebeat/input/awss3/_meta/terraform/README.md +++ b/x-pack/filebeat/input/awss3/_meta/terraform/README.md @@ -16,13 +16,13 @@ before running Terraform or the integration tests. The AWS key must be authorized to create and destroy S3 buckets and SQS queues. 1. Execute terraform in this directory to create the resources. This will also -write the `outputs.yml`. +write the `outputs.yml`. You can use `export TF_VAR_aws_region=NNNNN` in order +to match the AWS region of the profile you are using. `terraform apply` -2. View the output configuration and assure the region match in the aws profile used to run -the test or to set the environment variable `AWS_REGION` to the value in the output. +2. (Optional) View the output configuration. ```yaml "aws_region": "us-east-1" diff --git a/x-pack/filebeat/input/awss3/acker.go b/x-pack/filebeat/input/awss3/acker.go index 2c4cc39ee682..1ac45a8f38e1 100644 --- a/x-pack/filebeat/input/awss3/acker.go +++ b/x-pack/filebeat/input/awss3/acker.go @@ -28,7 +28,7 @@ func newEventACKTracker(ctx context.Context) *eventACKTracker { return &eventACKTracker{ctx: ctx, cancel: cancel} } -// Add increments the number of pending ACKs +// Add increments the number of pending ACKs. func (a *eventACKTracker) Add() { a.Lock() a.pendingACKs++ diff --git a/x-pack/filebeat/input/awss3/config.go b/x-pack/filebeat/input/awss3/config.go index 566a79426006..3719d4f344d8 100644 --- a/x-pack/filebeat/input/awss3/config.go +++ b/x-pack/filebeat/input/awss3/config.go @@ -19,52 +19,51 @@ import ( ) type config struct { - APITimeout time.Duration `config:"api_timeout"` - VisibilityTimeout time.Duration `config:"visibility_timeout"` - SQSWaitTime time.Duration `config:"sqs.wait_time"` // The max duration for which the SQS ReceiveMessage call waits for a message to arrive in the queue before returning. - SQSMaxReceiveCount int `config:"sqs.max_receive_count"` // The max number of times a message should be received (retried) before deleting it. - FIPSEnabled bool `config:"fips_enabled"` - MaxNumberOfMessages int `config:"max_number_of_messages"` - QueueURL string `config:"queue_url"` - S3Bucket string `config:"s3_bucket"` - S3BucketPollInterval time.Duration `config:"s3_bucket_poll_interval"` - S3BucketNumberOfWorkers int `config:"s3_bucket_number_of_workers"` - AWSConfig awscommon.ConfigAWS `config:",inline"` - FileSelectors []fileSelectorConfig `config:"file_selectors"` - ReaderConfig readerConfig `config:",inline"` // Reader options to apply when no file_selectors are used. + APITimeout time.Duration `config:"api_timeout"` + VisibilityTimeout time.Duration `config:"visibility_timeout"` + SQSWaitTime time.Duration `config:"sqs.wait_time"` // The max duration for which the SQS ReceiveMessage call waits for a message to arrive in the queue before returning. + SQSMaxReceiveCount int `config:"sqs.max_receive_count"` // The max number of times a message should be received (retried) before deleting it. 
+ FIPSEnabled bool `config:"fips_enabled"` + MaxNumberOfMessages int `config:"max_number_of_messages"` + QueueURL string `config:"queue_url"` + Bucket string `config:"bucket"` + BucketListInterval time.Duration `config:"bucket_list_interval"` + NumberOfWorkers int `config:"number_of_workers"` + AWSConfig awscommon.ConfigAWS `config:",inline"` + FileSelectors []fileSelectorConfig `config:"file_selectors"` + ReaderConfig readerConfig `config:",inline"` // Reader options to apply when no file_selectors are used. } func defaultConfig() config { c := config{ - APITimeout: 120 * time.Second, - VisibilityTimeout: 300 * time.Second, - S3BucketPollInterval: 120 * time.Second, - SQSWaitTime: 20 * time.Second, - SQSMaxReceiveCount: 5, - FIPSEnabled: false, - MaxNumberOfMessages: 5, + APITimeout: 120 * time.Second, + VisibilityTimeout: 300 * time.Second, + BucketListInterval: 120 * time.Second, + SQSWaitTime: 20 * time.Second, + SQSMaxReceiveCount: 5, + FIPSEnabled: false, + MaxNumberOfMessages: 5, } c.ReaderConfig.InitDefaults() return c } func (c *config) Validate() error { - if c.QueueURL == "" && c.S3Bucket == "" { + if c.QueueURL == "" && c.Bucket == "" { return fmt.Errorf("queue_url or s3_bucket must provided") } - if c.QueueURL != "" && c.S3Bucket != "" { + if c.QueueURL != "" && c.Bucket != "" { return fmt.Errorf("queue_url <%v> and s3_bucket <%v> "+ - "cannot be set at the same time", c.QueueURL, c.S3Bucket) + "cannot be set at the same time", c.QueueURL, c.Bucket) } - if c.S3Bucket != "" && (c.S3BucketPollInterval <= 0 || c.S3BucketPollInterval.Hours() > 12) { - return fmt.Errorf("s3_bucket_poll_interval <%v> must be greater than 0 and "+ - "less than or equal to 12h", c.S3BucketPollInterval) + if c.Bucket != "" && c.BucketListInterval <= 0 { + return fmt.Errorf("bucket_list_interval <%v> must be greater than 0", c.BucketListInterval) } - if c.S3Bucket != "" && c.S3BucketNumberOfWorkers <= 0 { - return fmt.Errorf("s3_bucket_number_of_workers <%v> must be greater than 0", c.S3BucketNumberOfWorkers) + if c.Bucket != "" && c.NumberOfWorkers <= 0 { + return fmt.Errorf("number_of_workers <%v> must be greater than 0", c.NumberOfWorkers) } if c.QueueURL != "" && (c.VisibilityTimeout <= 0 || c.VisibilityTimeout.Hours() > 12) { diff --git a/x-pack/filebeat/input/awss3/config_test.go b/x-pack/filebeat/input/awss3/config_test.go index b50c71d576bd..57b38987aff5 100644 --- a/x-pack/filebeat/input/awss3/config_test.go +++ b/x-pack/filebeat/input/awss3/config_test.go @@ -28,15 +28,15 @@ func TestConfig(t *testing.T) { parserConf := parser.Config{} require.NoError(t, parserConf.Unpack(common.MustNewConfigFrom(""))) return config{ - QueueURL: quequeURL, - S3Bucket: s3Bucket, - APITimeout: 120 * time.Second, - VisibilityTimeout: 300 * time.Second, - SQSMaxReceiveCount: 5, - SQSWaitTime: 20 * time.Second, - S3BucketPollInterval: 120 * time.Second, - FIPSEnabled: false, - MaxNumberOfMessages: 5, + QueueURL: quequeURL, + Bucket: s3Bucket, + APITimeout: 120 * time.Second, + VisibilityTimeout: 300 * time.Second, + SQSMaxReceiveCount: 5, + SQSWaitTime: 20 * time.Second, + BucketListInterval: 120 * time.Second, + FIPSEnabled: false, + MaxNumberOfMessages: 5, ReaderConfig: readerConfig{ BufferSize: 16 * humanize.KiByte, MaxBytes: 10 * humanize.MiByte, @@ -69,13 +69,13 @@ func TestConfig(t *testing.T) { "", s3Bucket, common.MapStr{ - "s3_bucket": s3Bucket, - "s3_bucket_number_of_workers": 5, + "s3_bucket": s3Bucket, + "number_of_workers": 5, }, "", func(queueURL, s3Bucket string) config { c := makeConfig("", 
s3Bucket) - c.S3BucketNumberOfWorkers = 5 + c.NumberOfWorkers = 5 return c }, }, @@ -160,36 +160,25 @@ func TestConfig(t *testing.T) { nil, }, { - "error on s3_bucket_poll_interval == 0", + "error on bucket_list_interval == 0", "", s3Bucket, common.MapStr{ - "s3_bucket": s3Bucket, - "s3_bucket_poll_interval": "0", + "s3_bucket": s3Bucket, + "bucket_list_interval": "0", }, - "s3_bucket_poll_interval <0s> must be greater than 0 and less than or equal to 12h", + "bucket_list_interval <0s> must be greater than 0", nil, }, { - "error on s3_bucket_poll_interval > 12h", + "error on number_of_workers == 0", "", s3Bucket, common.MapStr{ - "s3_bucket": s3Bucket, - "s3_bucket_poll_interval": "12h1ns", + "s3_bucket": s3Bucket, + "number_of_workers": "0", }, - "s3_bucket_poll_interval <12h0m0.000000001s> must be greater than 0 and less than or equal to 12h", - nil, - }, - { - "error on s3_bucket_number_of_workers == 0", - "", - s3Bucket, - common.MapStr{ - "s3_bucket": s3Bucket, - "s3_bucket_number_of_workers": "0", - }, - "s3_bucket_number_of_workers <0> must be greater than 0", + "number_of_workers <0> must be greater than 0", nil, }, { diff --git a/x-pack/filebeat/input/awss3/input.go b/x-pack/filebeat/input/awss3/input.go index cd8a54d744e4..d12b2a24cee5 100644 --- a/x-pack/filebeat/input/awss3/input.go +++ b/x-pack/filebeat/input/awss3/input.go @@ -10,14 +10,11 @@ import ( "net/url" "strings" - "github.com/elastic/go-concert/unison" - - "github.com/elastic/beats/v7/filebeat/beater" - awssdk "github.com/aws/aws-sdk-go-v2/aws" "github.com/aws/aws-sdk-go-v2/service/s3" "github.com/aws/aws-sdk-go-v2/service/sqs" + "github.com/elastic/beats/v7/filebeat/beater" v2 "github.com/elastic/beats/v7/filebeat/input/v2" "github.com/elastic/beats/v7/libbeat/beat" "github.com/elastic/beats/v7/libbeat/common" @@ -25,6 +22,7 @@ import ( "github.com/elastic/beats/v7/libbeat/monitoring" "github.com/elastic/beats/v7/libbeat/statestore" awscommon "github.com/elastic/beats/v7/x-pack/libbeat/common/aws" + "github.com/elastic/go-concert/unison" ) const inputName = "aws-s3" @@ -40,8 +38,7 @@ func Plugin(store beater.StateStore) v2.Plugin { } type s3InputManager struct { - s3Input *s3Input - store beater.StateStore + store beater.StateStore } func (im *s3InputManager) Init(grp unison.Group, mode v2.Mode) error { @@ -88,17 +85,15 @@ func (in *s3Input) Run(inputContext v2.Context, pipeline beat.Pipeline) error { persistentStore, err := in.store.Access() if err != nil { - return fmt.Errorf("Can not access persistent store: %w", err) + return fmt.Errorf("can not access persistent store: %w", err) } - defer func() { - persistentStore.Close() - }() + defer persistentStore.Close() - states := newStates() + states := newStates(inputContext) err = states.readStatesFrom(persistentStore) if err != nil { - return fmt.Errorf("Can not start persistent store: %w", err) + return fmt.Errorf("can not start persistent store: %w", err) } // Wrap input Context's cancellation Done channel a context.Context. This @@ -143,7 +138,7 @@ func (in *s3Input) Run(inputContext v2.Context, pipeline beat.Pipeline) error { } } - if in.config.S3Bucket != "" { + if in.config.Bucket != "" { // Create S3 receiver and S3 notification processor. 
poller, err := in.createS3Lister(inputContext, client, persistentStore, states) if err != nil { @@ -208,9 +203,9 @@ func (in *s3Input) createS3Lister(ctx v2.Context, client beat.Client, persistent client: s3.New(awscommon.EnrichAWSConfigWithEndpoint(in.config.AWSConfig.Endpoint, s3ServiceName, in.awsConfig.Region, in.awsConfig)), } - log := ctx.Logger.With("s3_bucket", in.config.S3Bucket) - log.Infof("s3_bucket_number_of_workers is set to %v.", in.config.S3BucketNumberOfWorkers) - log.Infof("s3_bucket_poll_interval is set to %v.", in.config.S3BucketPollInterval) + log := ctx.Logger.With("s3_bucket", in.config.Bucket) + log.Infof("number_of_workers is set to %v.", in.config.NumberOfWorkers) + log.Infof("bucket_list_interval is set to %v.", in.config.BucketListInterval) log.Infof("AWS region is set to %v.", in.awsConfig.Region) log.Debugf("AWS S3 service name is %v.", s3ServiceName) @@ -228,9 +223,9 @@ func (in *s3Input) createS3Lister(ctx v2.Context, client beat.Client, persistent s3EventHandlerFactory, states, persistentStore, - in.config.S3Bucket, - in.config.S3BucketNumberOfWorkers, - in.config.S3BucketPollInterval) + in.config.Bucket, + in.config.NumberOfWorkers, + in.config.BucketListInterval) return s3Poller, nil } diff --git a/x-pack/filebeat/input/awss3/input_benchmark_test.go b/x-pack/filebeat/input/awss3/input_benchmark_test.go index 6ab04ca630fe..0c7df7e012b0 100644 --- a/x-pack/filebeat/input/awss3/input_benchmark_test.go +++ b/x-pack/filebeat/input/awss3/input_benchmark_test.go @@ -14,8 +14,6 @@ import ( "testing" "time" - "github.com/elastic/beats/v7/libbeat/beat" - "github.com/aws/aws-sdk-go-v2/aws" "github.com/aws/aws-sdk-go-v2/service/s3" @@ -24,6 +22,7 @@ import ( "github.com/olekukonko/tablewriter" "github.com/pkg/errors" + "github.com/elastic/beats/v7/libbeat/beat" "github.com/elastic/beats/v7/libbeat/common" "github.com/elastic/beats/v7/libbeat/logp" "github.com/elastic/beats/v7/libbeat/monitoring" @@ -33,7 +32,7 @@ import ( ) const cloudtrailTestFile = "testdata/aws-cloudtrail.json.gz" -const totListingObjects = 10000 +const totalListingObjects = 10000 type constantSQS struct { msgs []sqs.Message @@ -98,7 +97,7 @@ func newS3PagerConstant() *s3PagerConstant { currentIndex: 0, } - for i := 0; i < totListingObjects; i++ { + for i := 0; i < totalListingObjects; i++ { ret.objects = append(ret.objects, s3.Object{ Key: aws.String(fmt.Sprintf("key-%d.json.gz", i)), ETag: aws.String(fmt.Sprintf("etag-%d", i)), @@ -278,13 +277,13 @@ func benchmarkInputS3(t *testing.T, numberOfWorkers int) testing.BenchmarkResult } s3EventHandlerFactory := newS3ObjectProcessorFactory(log.Named("s3"), metrics, s3API, client, conf.FileSelectors) - s3Poller := newS3Poller(logp.NewLogger(inputName), metrics, s3API, s3EventHandlerFactory, newStates(), store, "bucket", numberOfWorkers, time.Second) + s3Poller := newS3Poller(logp.NewLogger(inputName), metrics, s3API, s3EventHandlerFactory, newStates(inputCtx), store, "bucket", numberOfWorkers, time.Second) ctx, cancel := context.WithCancel(context.Background()) b.Cleanup(cancel) go func() { - for metrics.s3ObjectsAckedTotal.Get() < totListingObjects { + for metrics.s3ObjectsAckedTotal.Get() < totalListingObjects { time.Sleep(5 * time.Millisecond) } cancel() diff --git a/x-pack/filebeat/input/awss3/input_integration_test.go b/x-pack/filebeat/input/awss3/input_integration_test.go index 7d83769a4f51..1ce157c553f8 100644 --- a/x-pack/filebeat/input/awss3/input_integration_test.go +++ b/x-pack/filebeat/input/awss3/input_integration_test.go @@ -75,8 +75,8 @@ 
func getTerraformOutputs(t *testing.T) terraformOutputData { func makeTestConfigS3(s3bucket string) *common.Config { return common.MustNewConfigFrom(fmt.Sprintf(`--- -s3_bucket: aws:s3:::%s -s3_bucket_number_of_workers: 1 +bucket: aws:s3:::%s +number_of_workers: 1 file_selectors: - regex: 'events-array.json$' diff --git a/x-pack/filebeat/input/awss3/metrics.go b/x-pack/filebeat/input/awss3/metrics.go index 6a7bd59599c4..045aa7ffe17b 100644 --- a/x-pack/filebeat/input/awss3/metrics.go +++ b/x-pack/filebeat/input/awss3/metrics.go @@ -25,9 +25,12 @@ type inputMetrics struct { sqsMessageProcessingTime metrics.Sample // Histogram of the elapsed SQS processing times in nanoseconds (time of receipt to time of delete/return). s3ObjectsRequestedTotal *monitoring.Uint // Number of S3 objects downloaded. - s3ObjectsAckedTotal *monitoring.Uint // Number of S3 objects fully ACKed. - s3ObjectsListedTotal *monitoring.Uint // Number of S3 objects listed. - s3ObjectsProcessedTotal *monitoring.Uint // Number of S3 objects processed. + // s3ObjectsListedTotal is the number of S3 objects processed that were fully ACKed. + s3ObjectsAckedTotal *monitoring.Uint // Number of S3 objects fully ACKed. + // s3ObjectsListedTotal is the number of S3 objects returned by list operations. + s3ObjectsListedTotal *monitoring.Uint + // s3ObjectsProcessedTotal is the number of S3 objects that matched file_selectors rules. + s3ObjectsProcessedTotal *monitoring.Uint s3BytesProcessedTotal *monitoring.Uint // Number of S3 bytes processed. s3EventsCreatedTotal *monitoring.Uint // Number of events created from processing S3 data. s3ObjectsInflight *monitoring.Uint // Number of S3 objects inflight (gauge). diff --git a/x-pack/filebeat/input/awss3/s3.go b/x-pack/filebeat/input/awss3/s3.go index 150b54444c95..4d2313d35a9b 100644 --- a/x-pack/filebeat/input/awss3/s3.go +++ b/x-pack/filebeat/input/awss3/s3.go @@ -143,9 +143,7 @@ processingLoop: } func (p *s3Poller) GetS3Objects(ctx context.Context, s3ObjectPayloadChan chan<- *s3ObjectPayload) { - defer func() { - close(s3ObjectPayloadChan) - }() + defer close(s3ObjectPayloadChan) bucketMetadata := strings.Split(p.bucket, ":") bucketName := bucketMetadata[len(bucketMetadata)-1] @@ -176,7 +174,7 @@ func (p *s3Poller) GetS3Objects(ctx context.Context, s3ObjectPayloadChan chan<- // Unescape s3 key name. For example, convert "%3D" back to "=". 
filename, err := url.QueryUnescape(*object.Key) if err != nil { - p.log.Errorw("Error when unescaping object key, skipping.", "error", err, "key", *object.Key) + p.log.Errorw("Error when unescaping object key, skipping.", "error", err, "s3_object", *object.Key) continue } @@ -264,7 +262,7 @@ func (p *s3Poller) Purge() { } var latestStoredTime time.Time - keys[state.Id] = struct{}{} + keys[state.ID] = struct{}{} latestStoredTime, ok := latestStoredTimeByBucket[state.Bucket] if !ok { var commitWriteState commitWriteState @@ -273,10 +271,9 @@ func (p *s3Poller) Purge() { // we have no entry in the map and we have no entry in the store // set zero time latestStoredTime = time.Time{} + } else { + latestStoredTime = commitWriteState.Time } - - latestStoredTime = commitWriteState.Time - } if state.LastModified.After(latestStoredTime) { @@ -289,7 +286,9 @@ func (p *s3Poller) Purge() { p.states.Delete(key) } - p.states.writeStates(p.store) + if err := p.states.writeStates(p.store); err != nil { + p.log.Errorw("Failed to write states to the registry", "error", err) + } for bucket, latestStoredTime := range latestStoredTimeByBucket { if err := p.store.Set(awsS3WriteCommitPrefix+bucket, commitWriteState{latestStoredTime}); err != nil { diff --git a/x-pack/filebeat/input/awss3/s3_test.go b/x-pack/filebeat/input/awss3/s3_test.go index bd3a544bd040..a02f3a58495c 100644 --- a/x-pack/filebeat/input/awss3/s3_test.go +++ b/x-pack/filebeat/input/awss3/s3_test.go @@ -133,7 +133,7 @@ func TestS3Poller(t *testing.T) { Return(nil, errFakeConnectivityFailure) s3ObjProc := newS3ObjectProcessorFactory(logp.NewLogger(inputName), nil, mockAPI, mockPublisher, nil) - receiver := newS3Poller(logp.NewLogger(inputName), nil, mockAPI, s3ObjProc, newStates(), store, bucket, numberOfWorkers, pollInterval) + receiver := newS3Poller(logp.NewLogger(inputName), nil, mockAPI, s3ObjProc, newStates(inputCtx), store, bucket, numberOfWorkers, pollInterval) require.Error(t, context.DeadlineExceeded, receiver.Poll(ctx)) assert.Equal(t, numberOfWorkers, receiver.workerSem.available) }) @@ -263,7 +263,7 @@ func TestS3Poller(t *testing.T) { Return(nil, errFakeConnectivityFailure) s3ObjProc := newS3ObjectProcessorFactory(logp.NewLogger(inputName), nil, mockAPI, mockPublisher, nil) - receiver := newS3Poller(logp.NewLogger(inputName), nil, mockAPI, s3ObjProc, newStates(), store, bucket, numberOfWorkers, pollInterval) + receiver := newS3Poller(logp.NewLogger(inputName), nil, mockAPI, s3ObjProc, newStates(inputCtx), store, bucket, numberOfWorkers, pollInterval) require.Error(t, context.DeadlineExceeded, receiver.Poll(ctx)) assert.Equal(t, numberOfWorkers, receiver.workerSem.available) }) diff --git a/x-pack/filebeat/input/awss3/state.go b/x-pack/filebeat/input/awss3/state.go index c772935a7eaa..11b20652bfd1 100644 --- a/x-pack/filebeat/input/awss3/state.go +++ b/x-pack/filebeat/input/awss3/state.go @@ -11,11 +11,14 @@ import ( // state is used to communicate the publishing state of a s3 object type state struct { - Id string `json:"id" struct:"id"` + // ID is used to identify the state in the store, and it is composed by + // Bucket + Key + Etag + LastModified.String(): changing this value or how it is + // composed will break backward compatibilities with entries already in the store. 
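+	// For example, a state for bucket "bucket", key "key" and etag "etag"
+	// gets the ID "bucketkeyetag" followed by LastModified.String().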
+ ID string `json:"id" struct:"id"` Bucket string `json:"bucket" struct:"bucket"` Key string `json:"key" struct:"key"` Etag string `json:"etag" struct:"etag"` - LastModified time.Time `json:"last_modified" struct:"last_modifed"` + LastModified time.Time `json:"last_modified" struct:"last_modified"` // A state has Stored = true when all events are ACKed. Stored bool `json:"stored" struct:"stored"` @@ -34,7 +37,7 @@ func newState(bucket, key, etag string, lastModified time.Time) state { Error: false, } - s.Id = s.Bucket + s.Key + s.Etag + s.LastModified.String() + s.ID = s.Bucket + s.Key + s.Etag + s.LastModified.String() return s } @@ -44,7 +47,7 @@ func (s *state) MarkAsStored() { s.Stored = true } -// MarkAsStored set the error flag to true +// MarkAsError set the error flag to true func (s *state) MarkAsError() { s.Error = true } @@ -63,8 +66,8 @@ func (s *state) IsEmpty() bool { // String returns string representation of the struct func (s *state) String() string { return fmt.Sprintf( - "{Id: %v, Bucket: %v, Key: %v, Etag: %v, LastModified: %v}", - s.Id, + "{ID: %v, Bucket: %v, Key: %v, Etag: %v, LastModified: %v}", + s.ID, s.Bucket, s.Key, s.Etag, diff --git a/x-pack/filebeat/input/awss3/states.go b/x-pack/filebeat/input/awss3/states.go index 154e4a5490f3..6674ee104c10 100644 --- a/x-pack/filebeat/input/awss3/states.go +++ b/x-pack/filebeat/input/awss3/states.go @@ -8,9 +8,11 @@ import ( "strings" "sync" - "github.com/elastic/beats/v7/libbeat/statestore" + v2 "github.com/elastic/beats/v7/filebeat/input/v2" "github.com/elastic/beats/v7/libbeat/logp" + + "github.com/elastic/beats/v7/libbeat/statestore" ) const ( @@ -30,6 +32,8 @@ type listingInfo struct { type states struct { sync.RWMutex + log *logp.Logger + // states store states []state @@ -42,8 +46,9 @@ type states struct { } // newStates generates a new states registry. -func newStates() *states { +func newStates(ctx v2.Context) *states { return &states{ + log: ctx.Logger.Named("states"), states: nil, idx: map[string]int{}, listingInfo: new(sync.Map), @@ -59,7 +64,7 @@ func (s *states) MustSkip(state state, store *statestore.Store) bool { previousState := s.FindPrevious(state) - // status is forget. if there is no previous state and + // status is forgotten. if there is no previous state and // the state.LastModified is before the last cleanStore // write commit we can remove var commitWriteState commitWriteState @@ -90,7 +95,7 @@ func (s *states) Delete(id string) { s.idx = map[string]int{} for i, state := range s.states { - s.idx[state.Id] = i + s.idx[state.ID] = i } } } @@ -139,7 +144,7 @@ func (s *states) Update(newState state, listingID string) { // No existing state found, add new one s.idx[id] = len(s.states) s.states = append(s.states, newState) - logp.Debug("input", "New state added for %s", newState.Id) + s.log.Debug("input", "New state added for %s", newState.ID) } if listingID == "" || (!newState.Stored && !newState.Error) { @@ -256,7 +261,7 @@ func (s *states) readStatesFrom(store *statestore.Store) error { return true, nil } - // try to decode. Ingore faulty/incompatible values. + // try to decode. Ignore faulty/incompatible values. var st state if err := dec.Decode(&st); err != nil { // XXX: Do we want to log here? 
In case we start to store other @@ -265,7 +270,7 @@ func (s *states) readStatesFrom(store *statestore.Store) error { return true, nil } - st.Id = key[len(awsS3ObjectStatePrefix):] + st.ID = key[len(awsS3ObjectStatePrefix):] states = append(states, st) return true, nil }) @@ -295,9 +300,9 @@ func fixStates(states []state) []state { for i := range states { state := &states[i] - old, exists := idx[state.Id] + old, exists := idx[state.ID] if !exists { - idx[state.Id] = state + idx[state.ID] = state } else { mergeStates(old, state) // overwrite the entry in 'old' } @@ -328,7 +333,7 @@ func mergeStates(st, other *state) { func (s *states) writeStates(store *statestore.Store) error { for _, state := range s.GetStates() { - key := awsS3ObjectStatePrefix + state.Id + key := awsS3ObjectStatePrefix + state.ID if err := store.Set(key, state); err != nil { return err } diff --git a/x-pack/filebeat/input/awss3/states_test.go b/x-pack/filebeat/input/awss3/states_test.go index 952bc98e9b76..c99e7c0889b3 100644 --- a/x-pack/filebeat/input/awss3/states_test.go +++ b/x-pack/filebeat/input/awss3/states_test.go @@ -5,12 +5,21 @@ package awss3 import ( + "context" "testing" "time" "github.com/stretchr/testify/assert" + + v2 "github.com/elastic/beats/v7/filebeat/input/v2" + "github.com/elastic/beats/v7/libbeat/logp" ) +var inputCtx = v2.Context{ + Logger: logp.NewLogger("test"), + Cancelation: context.Background(), +} + func TestStatesDelete(t *testing.T) { type stateTestCase struct { states func() *states @@ -22,21 +31,21 @@ func TestStatesDelete(t *testing.T) { tests := map[string]stateTestCase{ "delete empty states": { states: func() *states { - return newStates() + return newStates(inputCtx) }, deleteID: "an id", expected: []state{}, }, "delete not existing state": { states: func() *states { - states := newStates() + states := newStates(inputCtx) states.Update(newState("bucket", "key", "etag", lastModified), "") return states }, deleteID: "an id", expected: []state{ { - Id: "bucketkeyetag" + lastModified.String(), + ID: "bucketkeyetag" + lastModified.String(), Bucket: "bucket", Key: "key", Etag: "etag", @@ -46,7 +55,7 @@ func TestStatesDelete(t *testing.T) { }, "delete only one existing": { states: func() *states { - states := newStates() + states := newStates(inputCtx) states.Update(newState("bucket", "key", "etag", lastModified), "") return states }, @@ -55,7 +64,7 @@ func TestStatesDelete(t *testing.T) { }, "delete first": { states: func() *states { - states := newStates() + states := newStates(inputCtx) states.Update(newState("bucket", "key1", "etag1", lastModified), "") states.Update(newState("bucket", "key2", "etag2", lastModified), "") states.Update(newState("bucket", "key3", "etag3", lastModified), "") @@ -64,14 +73,14 @@ func TestStatesDelete(t *testing.T) { deleteID: "bucketkey1", expected: []state{ { - Id: "bucketkey3etag3" + lastModified.String(), + ID: "bucketkey3etag3" + lastModified.String(), Bucket: "bucket", Key: "key3", Etag: "etag3", LastModified: lastModified, }, { - Id: "bucketkey2etag2" + lastModified.String(), + ID: "bucketkey2etag2" + lastModified.String(), Bucket: "bucket", Key: "key2", Etag: "etag2", @@ -81,7 +90,7 @@ func TestStatesDelete(t *testing.T) { }, "delete last": { states: func() *states { - states := newStates() + states := newStates(inputCtx) states.Update(newState("bucket", "key1", "etag1", lastModified), "") states.Update(newState("bucket", "key2", "etag2", lastModified), "") states.Update(newState("bucket", "key3", "etag3", lastModified), "") @@ -90,14 +99,14 @@ func 
TestStatesDelete(t *testing.T) { deleteID: "bucketkey3", expected: []state{ { - Id: "bucketkey1etag1" + lastModified.String(), + ID: "bucketkey1etag1" + lastModified.String(), Bucket: "bucket", Key: "key1", Etag: "etag1", LastModified: lastModified, }, { - Id: "bucketkey2etag2" + lastModified.String(), + ID: "bucketkey2etag2" + lastModified.String(), Bucket: "bucket", Key: "key2", Etag: "etag2", @@ -107,7 +116,7 @@ func TestStatesDelete(t *testing.T) { }, "delete any": { states: func() *states { - states := newStates() + states := newStates(inputCtx) states.Update(newState("bucket", "key1", "etag1", lastModified), "") states.Update(newState("bucket", "key2", "etag2", lastModified), "") states.Update(newState("bucket", "key3", "etag3", lastModified), "") @@ -116,14 +125,14 @@ func TestStatesDelete(t *testing.T) { deleteID: "bucketkey2", expected: []state{ { - Id: "bucketkey1etag1" + lastModified.String(), + ID: "bucketkey1etag1" + lastModified.String(), Bucket: "bucket", Key: "key1", Etag: "etag1", LastModified: lastModified, }, { - Id: "bucketkey3etag3" + lastModified.String(), + ID: "bucketkey3etag3" + lastModified.String(), Bucket: "bucket", Key: "key3", Etag: "etag3", diff --git a/x-pack/filebeat/module/aws/_meta/docs.asciidoc b/x-pack/filebeat/module/aws/_meta/docs.asciidoc index a3bb06ee12f0..ebfdf4703bf5 100644 --- a/x-pack/filebeat/module/aws/_meta/docs.asciidoc +++ b/x-pack/filebeat/module/aws/_meta/docs.asciidoc @@ -44,8 +44,9 @@ Example config: cloudtrail: enabled: false #var.queue_url: https://sqs.myregion.amazonaws.com/123456/myqueue - #var.s3_bucket: 'arn:aws:s3:::mybucket - #var.s3_bucket_poll_interval: 300s + #var.bucket: 'arn:aws:s3:::mybucket + #var.bucket_list_interval: 300s + #var.number_of_workers: 5 #var.shared_credential_file: /etc/filebeat/aws_credentials #var.credential_profile_name: fb-aws #var.access_key_id: access_key_id @@ -60,8 +61,9 @@ Example config: cloudwatch: enabled: false #var.queue_url: https://sqs.myregion.amazonaws.com/123456/myqueue - #var.s3_bucket: 'arn:aws:s3:::mybucket - #var.s3_bucket_poll_interval: 300s + #var.bucket: 'arn:aws:s3:::mybucket + #var.bucket_list_interval: 300s + #var.number_of_workers: 5 #var.shared_credential_file: /etc/filebeat/aws_credentials #var.credential_profile_name: fb-aws #var.access_key_id: access_key_id @@ -76,8 +78,9 @@ Example config: ec2: enabled: false #var.queue_url: https://sqs.myregion.amazonaws.com/123456/myqueue - #var.s3_bucket: 'arn:aws:s3:::mybucket - #var.s3_bucket_poll_interval: 300s + #var.bucket: 'arn:aws:s3:::mybucket + #var.bucket_list_interval: 300s + #var.number_of_workers: 5 #var.shared_credential_file: /etc/filebeat/aws_credentials #var.credential_profile_name: fb-aws #var.access_key_id: access_key_id @@ -92,8 +95,9 @@ Example config: elb: enabled: false #var.queue_url: https://sqs.myregion.amazonaws.com/123456/myqueue - #var.s3_bucket: 'arn:aws:s3:::mybucket - #var.s3_bucket_poll_interval: 300s + #var.bucket: 'arn:aws:s3:::mybucket + #var.bucket_list_interval: 300s + #var.number_of_workers: 5 #var.shared_credential_file: /etc/filebeat/aws_credentials #var.credential_profile_name: fb-aws #var.access_key_id: access_key_id @@ -108,8 +112,9 @@ Example config: s3access: enabled: false #var.queue_url: https://sqs.myregion.amazonaws.com/123456/myqueue - #var.s3_bucket: 'arn:aws:s3:::mybucket - #var.s3_bucket_poll_interval: 300s + #var.bucket: 'arn:aws:s3:::mybucket + #var.bucket_list_interval: 300s + #var.number_of_workers: 5 #var.shared_credential_file: /etc/filebeat/aws_credentials 
#var.credential_profile_name: fb-aws #var.access_key_id: access_key_id @@ -124,8 +129,9 @@ Example config: vpcflow: enabled: false #var.queue_url: https://sqs.myregion.amazonaws.com/123456/myqueue - #var.s3_bucket: 'arn:aws:s3:::mybucket - #var.s3_bucket_poll_interval: 300s + #var.bucket: 'arn:aws:s3:::mybucket + #var.bucket_list_interval: 300s + #var.number_of_workers: 5 #var.shared_credential_file: /etc/filebeat/aws_credentials #var.credential_profile_name: fb-aws #var.access_key_id: access_key_id @@ -140,7 +146,7 @@ Example config: *`var.queue_url`*:: -AWS SQS queue url (Required when `var.s3_bucket` is not set). +AWS SQS queue url (Required when `var.bucket` is not set). *`var.visibility_timeout`*:: @@ -151,18 +157,18 @@ Default to be 300 seconds. Maximum duration before AWS API request will be interrupted. Default to be 120 seconds. -*`var.s3_bucket`*:: +*`var.bucket`*:: AWS S3 bucket ARN (Required when `var.queue_url` is not set). -*`var.s3_bucket_number_of_workers`*:: +*`var.number_of_workers`*:: -Number of workers that will process the S3 objects listed (Required when `var.s3_bucket` is set). +Number of workers that will process the S3 objects listed (Required when `var.bucket` is set). Use to vertically scale the input. -*`var.s3_bucket_poll_interval`*:: +*`var.bucket_list_interval`*:: -Interval between list requests to the S3 bucket. Default to be 120 seconds. +Wait interval between completion of a list request to the S3 bucket and beginning of the nest one. Default to be 120 seconds. *`var.endpoint`*:: diff --git a/x-pack/filebeat/module/aws/cloudtrail/config/aws-s3.yml b/x-pack/filebeat/module/aws/cloudtrail/config/aws-s3.yml index 17dd64a99fc5..78f068225906 100644 --- a/x-pack/filebeat/module/aws/cloudtrail/config/aws-s3.yml +++ b/x-pack/filebeat/module/aws/cloudtrail/config/aws-s3.yml @@ -2,16 +2,16 @@ type: aws-s3 {{ if .queue_url }} queue_url: {{ .queue_url }} {{ end }} -{{ if .s3_bucket }} -s3_bucket: {{ .s3_bucket }} +{{ if .bucket }} +bucket: {{ .bucket }} {{ end }} -{{ if .s3_bucket_number_of_workers }} -s3_bucket_number_of_workers: {{ .s3_bucket_number_of_workers }} +{{ if .number_of_workers }} +number_of_workers: {{ .number_of_workers }} {{ end }} -{{ if .s3_bucket_poll_interval }} -s3_bucket_poll_interval: {{ .s3_bucket_poll_interval }} +{{ if .bucket_list_interval }} +bucket_list_interval: {{ .bucket_list_interval }} {{ end }} file_selectors: {{ if .process_cloudtrail_logs }} diff --git a/x-pack/filebeat/module/aws/cloudtrail/manifest.yml b/x-pack/filebeat/module/aws/cloudtrail/manifest.yml index d06d1b0b6774..9d40124d846a 100644 --- a/x-pack/filebeat/module/aws/cloudtrail/manifest.yml +++ b/x-pack/filebeat/module/aws/cloudtrail/manifest.yml @@ -4,9 +4,9 @@ var: - name: input default: aws-s3 - name: queue_url - - name: s3_bucket - - name: s3_bucket_number_of_workers - - name: s3_bucket_poll_interval + - name: bucket + - name: number_of_workers + - name: bucket_list_interval - name: shared_credential_file - name: credential_profile_name - name: visibility_timeout diff --git a/x-pack/filebeat/module/aws/cloudwatch/config/aws-s3.yml b/x-pack/filebeat/module/aws/cloudwatch/config/aws-s3.yml index 2f7a694382aa..e960abcffe5f 100644 --- a/x-pack/filebeat/module/aws/cloudwatch/config/aws-s3.yml +++ b/x-pack/filebeat/module/aws/cloudwatch/config/aws-s3.yml @@ -2,16 +2,16 @@ type: aws-s3 {{ if .queue_url }} queue_url: {{ .queue_url }} {{ end }} -{{ if .s3_bucket }} -s3_bucket: {{ .s3_bucket }} +{{ if .bucket }} +bucket: {{ .bucket }} {{ end }} -{{ if 
.s3_bucket_number_of_workers }} -s3_bucket_number_of_workers: {{ .s3_bucket_number_of_workers }} +{{ if .number_of_workers }} +number_of_workers: {{ .number_of_workers }} {{ end }} -{{ if .s3_bucket_poll_interval }} -s3_bucket_poll_interval: {{ .s3_bucket_poll_interval }} +{{ if .bucket_list_interval }} +bucket_list_interval: {{ .bucket_list_interval }} {{ end }} {{ if .credential_profile_name }} diff --git a/x-pack/filebeat/module/aws/cloudwatch/manifest.yml b/x-pack/filebeat/module/aws/cloudwatch/manifest.yml index 275cae461146..415d3f42718b 100644 --- a/x-pack/filebeat/module/aws/cloudwatch/manifest.yml +++ b/x-pack/filebeat/module/aws/cloudwatch/manifest.yml @@ -4,9 +4,9 @@ var: - name: input default: aws-s3 - name: queue_url - - name: s3_bucket - - name: s3_bucket_number_of_workers - - name: s3_bucket_poll_interval + - name: bucket + - name: number_of_workers + - name: bucket_list_interval - name: shared_credential_file - name: credential_profile_name - name: visibility_timeout diff --git a/x-pack/filebeat/module/aws/ec2/config/aws-s3.yml b/x-pack/filebeat/module/aws/ec2/config/aws-s3.yml index 2f7a694382aa..e960abcffe5f 100644 --- a/x-pack/filebeat/module/aws/ec2/config/aws-s3.yml +++ b/x-pack/filebeat/module/aws/ec2/config/aws-s3.yml @@ -2,16 +2,16 @@ type: aws-s3 {{ if .queue_url }} queue_url: {{ .queue_url }} {{ end }} -{{ if .s3_bucket }} -s3_bucket: {{ .s3_bucket }} +{{ if .bucket }} +bucket: {{ .bucket }} {{ end }} -{{ if .s3_bucket_number_of_workers }} -s3_bucket_number_of_workers: {{ .s3_bucket_number_of_workers }} +{{ if .number_of_workers }} +number_of_workers: {{ .number_of_workers }} {{ end }} -{{ if .s3_bucket_poll_interval }} -s3_bucket_poll_interval: {{ .s3_bucket_poll_interval }} +{{ if .bucket_list_interval }} +bucket_list_interval: {{ .bucket_list_interval }} {{ end }} {{ if .credential_profile_name }} diff --git a/x-pack/filebeat/module/aws/ec2/manifest.yml b/x-pack/filebeat/module/aws/ec2/manifest.yml index 275cae461146..415d3f42718b 100644 --- a/x-pack/filebeat/module/aws/ec2/manifest.yml +++ b/x-pack/filebeat/module/aws/ec2/manifest.yml @@ -4,9 +4,9 @@ var: - name: input default: aws-s3 - name: queue_url - - name: s3_bucket - - name: s3_bucket_number_of_workers - - name: s3_bucket_poll_interval + - name: bucket + - name: number_of_workers + - name: bucket_list_interval - name: shared_credential_file - name: credential_profile_name - name: visibility_timeout diff --git a/x-pack/filebeat/module/aws/elb/config/aws-s3.yml b/x-pack/filebeat/module/aws/elb/config/aws-s3.yml index 2f7a694382aa..e960abcffe5f 100644 --- a/x-pack/filebeat/module/aws/elb/config/aws-s3.yml +++ b/x-pack/filebeat/module/aws/elb/config/aws-s3.yml @@ -2,16 +2,16 @@ type: aws-s3 {{ if .queue_url }} queue_url: {{ .queue_url }} {{ end }} -{{ if .s3_bucket }} -s3_bucket: {{ .s3_bucket }} +{{ if .bucket }} +bucket: {{ .bucket }} {{ end }} -{{ if .s3_bucket_number_of_workers }} -s3_bucket_number_of_workers: {{ .s3_bucket_number_of_workers }} +{{ if .number_of_workers }} +number_of_workers: {{ .number_of_workers }} {{ end }} -{{ if .s3_bucket_poll_interval }} -s3_bucket_poll_interval: {{ .s3_bucket_poll_interval }} +{{ if .bucket_list_interval }} +bucket_list_interval: {{ .bucket_list_interval }} {{ end }} {{ if .credential_profile_name }} diff --git a/x-pack/filebeat/module/aws/elb/manifest.yml b/x-pack/filebeat/module/aws/elb/manifest.yml index 2392b96ecf71..128fcbf735e7 100644 --- a/x-pack/filebeat/module/aws/elb/manifest.yml +++ b/x-pack/filebeat/module/aws/elb/manifest.yml @@ -4,9 +4,9 @@ var: - 
name: input default: aws-s3 - name: queue_url - - name: s3_bucket - - name: s3_bucket_number_of_workers - - name: s3_bucket_poll_interval + - name: bucket + - name: number_of_workers + - name: bucket_list_interval - name: shared_credential_file - name: credential_profile_name - name: visibility_timeout diff --git a/x-pack/filebeat/module/aws/s3access/config/aws-s3.yml b/x-pack/filebeat/module/aws/s3access/config/aws-s3.yml index 2f7a694382aa..e960abcffe5f 100644 --- a/x-pack/filebeat/module/aws/s3access/config/aws-s3.yml +++ b/x-pack/filebeat/module/aws/s3access/config/aws-s3.yml @@ -2,16 +2,16 @@ type: aws-s3 {{ if .queue_url }} queue_url: {{ .queue_url }} {{ end }} -{{ if .s3_bucket }} -s3_bucket: {{ .s3_bucket }} +{{ if .bucket }} +bucket: {{ .bucket }} {{ end }} -{{ if .s3_bucket_number_of_workers }} -s3_bucket_number_of_workers: {{ .s3_bucket_number_of_workers }} +{{ if .number_of_workers }} +number_of_workers: {{ .number_of_workers }} {{ end }} -{{ if .s3_bucket_poll_interval }} -s3_bucket_poll_interval: {{ .s3_bucket_poll_interval }} +{{ if .bucket_list_interval }} +bucket_list_interval: {{ .bucket_list_interval }} {{ end }} {{ if .credential_profile_name }} diff --git a/x-pack/filebeat/module/aws/s3access/manifest.yml b/x-pack/filebeat/module/aws/s3access/manifest.yml index 275cae461146..415d3f42718b 100644 --- a/x-pack/filebeat/module/aws/s3access/manifest.yml +++ b/x-pack/filebeat/module/aws/s3access/manifest.yml @@ -4,9 +4,9 @@ var: - name: input default: aws-s3 - name: queue_url - - name: s3_bucket - - name: s3_bucket_number_of_workers - - name: s3_bucket_poll_interval + - name: bucket + - name: number_of_workers + - name: bucket_list_interval - name: shared_credential_file - name: credential_profile_name - name: visibility_timeout diff --git a/x-pack/filebeat/module/aws/vpcflow/config/input.yml b/x-pack/filebeat/module/aws/vpcflow/config/input.yml index 5a488669b9a8..f11ffcbc1e0f 100644 --- a/x-pack/filebeat/module/aws/vpcflow/config/input.yml +++ b/x-pack/filebeat/module/aws/vpcflow/config/input.yml @@ -4,16 +4,16 @@ type: aws-s3 {{ if .queue_url }} queue_url: {{ .queue_url }} {{ end }} -{{ if .s3_bucket }} -s3_bucket: {{ .s3_bucket }} +{{ if .bucket }} +bucket: {{ .bucket }} {{ end }} -{{ if .s3_bucket_number_of_workers }} -s3_bucket_number_of_workers: {{ .s3_bucket_number_of_workers }} +{{ if .number_of_workers }} +number_of_workers: {{ .number_of_workers }} {{ end }} -{{ if .s3_bucket_poll_interval }} -s3_bucket_poll_interval: {{ .s3_bucket_poll_interval }} +{{ if .bucket_list_interval }} +bucket_list_interval: {{ .bucket_list_interval }} {{ end }} {{ if .credential_profile_name }} diff --git a/x-pack/filebeat/module/aws/vpcflow/manifest.yml b/x-pack/filebeat/module/aws/vpcflow/manifest.yml index bcfeb3132858..d3122493b8cd 100644 --- a/x-pack/filebeat/module/aws/vpcflow/manifest.yml +++ b/x-pack/filebeat/module/aws/vpcflow/manifest.yml @@ -4,9 +4,9 @@ var: - name: input default: aws-s3 - name: queue_url - - name: s3_bucket - - name: s3_bucket_number_of_workers - - name: s3_bucket_poll_interval + - name: bucket + - name: number_of_workers + - name: bucket_list_interval - name: shared_credential_file - name: credential_profile_name - name: visibility_timeout From 42284d068988f12b846e0d1c3a7b2b594a35661c Mon Sep 17 00:00:00 2001 From: Andrea Spacca Date: Mon, 16 Aug 2021 11:55:44 +0200 Subject: [PATCH 15/20] make update --- filebeat/docs/modules/aws.asciidoc | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git 
a/filebeat/docs/modules/aws.asciidoc b/filebeat/docs/modules/aws.asciidoc index d0564b71af42..94c62f344f78 100644 --- a/filebeat/docs/modules/aws.asciidoc +++ b/filebeat/docs/modules/aws.asciidoc @@ -49,8 +49,9 @@ Example config: cloudtrail: enabled: false #var.queue_url: https://sqs.myregion.amazonaws.com/123456/myqueue - #var.s3_bucket: 'arn:aws:s3:::mybucket + #var.bucket: 'arn:aws:s3:::mybucket #var.bucket_list_interval: 300s + #var.number_of_workers: 5 #var.shared_credential_file: /etc/filebeat/aws_credentials #var.credential_profile_name: fb-aws #var.access_key_id: access_key_id @@ -65,8 +66,9 @@ Example config: cloudwatch: enabled: false #var.queue_url: https://sqs.myregion.amazonaws.com/123456/myqueue - #var.s3_bucket: 'arn:aws:s3:::mybucket + #var.bucket: 'arn:aws:s3:::mybucket #var.bucket_list_interval: 300s + #var.number_of_workers: 5 #var.shared_credential_file: /etc/filebeat/aws_credentials #var.credential_profile_name: fb-aws #var.access_key_id: access_key_id @@ -81,8 +83,9 @@ Example config: ec2: enabled: false #var.queue_url: https://sqs.myregion.amazonaws.com/123456/myqueue - #var.s3_bucket: 'arn:aws:s3:::mybucket + #var.bucket: 'arn:aws:s3:::mybucket #var.bucket_list_interval: 300s + #var.number_of_workers: 5 #var.shared_credential_file: /etc/filebeat/aws_credentials #var.credential_profile_name: fb-aws #var.access_key_id: access_key_id @@ -97,8 +100,9 @@ Example config: elb: enabled: false #var.queue_url: https://sqs.myregion.amazonaws.com/123456/myqueue - #var.s3_bucket: 'arn:aws:s3:::mybucket + #var.bucket: 'arn:aws:s3:::mybucket #var.bucket_list_interval: 300s + #var.number_of_workers: 5 #var.shared_credential_file: /etc/filebeat/aws_credentials #var.credential_profile_name: fb-aws #var.access_key_id: access_key_id @@ -113,8 +117,9 @@ Example config: s3access: enabled: false #var.queue_url: https://sqs.myregion.amazonaws.com/123456/myqueue - #var.s3_bucket: 'arn:aws:s3:::mybucket + #var.bucket: 'arn:aws:s3:::mybucket #var.bucket_list_interval: 300s + #var.number_of_workers: 5 #var.shared_credential_file: /etc/filebeat/aws_credentials #var.credential_profile_name: fb-aws #var.access_key_id: access_key_id @@ -129,8 +134,9 @@ Example config: vpcflow: enabled: false #var.queue_url: https://sqs.myregion.amazonaws.com/123456/myqueue - #var.s3_bucket: 'arn:aws:s3:::mybucket + #var.bucket: 'arn:aws:s3:::mybucket #var.bucket_list_interval: 300s + #var.number_of_workers: 5 #var.shared_credential_file: /etc/filebeat/aws_credentials #var.credential_profile_name: fb-aws #var.access_key_id: access_key_id @@ -145,7 +151,7 @@ Example config: *`var.queue_url`*:: -AWS SQS queue url (Required when `var.s3_bucket` is not set). +AWS SQS queue url (Required when `var.bucket` is not set). *`var.visibility_timeout`*:: @@ -156,18 +162,18 @@ Default to be 300 seconds. Maximum duration before AWS API request will be interrupted. Default to be 120 seconds. -*`var.s3_bucket`*:: +*`var.bucket`*:: AWS S3 bucket ARN (Required when `var.queue_url` is not set). *`var.number_of_workers`*:: -Number of workers that will process the S3 objects listed (Required when `var.s3_bucket` is set). +Number of workers that will process the S3 objects listed (Required when `var.bucket` is set). Use to vertically scale the input. *`var.bucket_list_interval`*:: -Interval between list requests to the S3 bucket. Default to be 120 seconds. +Wait interval between completion of a list request to the S3 bucket and beginning of the nest one. Default to be 120 seconds. 
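To make the renamed variables easier to follow, a minimal sketch of a bucket-polling module configuration built from the values in the example config above; the `- module: aws` wrapper and the `modules.d/aws.yml` location are assumed standard Filebeat module conventions rather than something shown in this hunk, and the bucket ARN is a placeholder:

["source","yaml"]
----
# modules.d/aws.yml (assumed location): poll the bucket listing instead of SQS
- module: aws
  s3access:
    enabled: true
    var.bucket: 'arn:aws:s3:::mybucket'   # mutually exclusive with var.queue_url
    var.bucket_list_interval: 300s        # wait after one list request completes before the next
    var.number_of_workers: 5              # workers that process the listed S3 objects
    var.credential_profile_name: fb-aws
----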
*`var.endpoint`*:: From c99c412f0087bf3ae97657be4b23f3b43c84a3fc Mon Sep 17 00:00:00 2001 From: Andrea Spacca Date: Mon, 16 Aug 2021 12:54:24 +0200 Subject: [PATCH 16/20] fix config test --- x-pack/filebeat/input/awss3/config.go | 4 ++-- x-pack/filebeat/input/awss3/config_test.go | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/x-pack/filebeat/input/awss3/config.go b/x-pack/filebeat/input/awss3/config.go index 3719d4f344d8..9fef4a9fc600 100644 --- a/x-pack/filebeat/input/awss3/config.go +++ b/x-pack/filebeat/input/awss3/config.go @@ -50,11 +50,11 @@ func defaultConfig() config { func (c *config) Validate() error { if c.QueueURL == "" && c.Bucket == "" { - return fmt.Errorf("queue_url or s3_bucket must provided") + return fmt.Errorf("queue_url or bucket must provided") } if c.QueueURL != "" && c.Bucket != "" { - return fmt.Errorf("queue_url <%v> and s3_bucket <%v> "+ + return fmt.Errorf("queue_url <%v> and bucket <%v> "+ "cannot be set at the same time", c.QueueURL, c.Bucket) } diff --git a/x-pack/filebeat/input/awss3/config_test.go b/x-pack/filebeat/input/awss3/config_test.go index 57b38987aff5..c0fd94e2aee6 100644 --- a/x-pack/filebeat/input/awss3/config_test.go +++ b/x-pack/filebeat/input/awss3/config_test.go @@ -69,7 +69,7 @@ func TestConfig(t *testing.T) { "", s3Bucket, common.MapStr{ - "s3_bucket": s3Bucket, + "bucket": s3Bucket, "number_of_workers": 5, }, "", @@ -110,9 +110,9 @@ func TestConfig(t *testing.T) { "", common.MapStr{ "queue_url": "", - "s3_bucket": "", + "bucket": "", }, - "queue_url or s3_bucket must provided", + "queue_url or bucket must provided", nil, }, { @@ -121,9 +121,9 @@ func TestConfig(t *testing.T) { s3Bucket, common.MapStr{ "queue_url": queueURL, - "s3_bucket": s3Bucket, + "bucket": s3Bucket, }, - "queue_url and s3_bucket cannot be set at the same time", + "queue_url and bucket cannot be set at the same time", nil, }, { @@ -164,7 +164,7 @@ func TestConfig(t *testing.T) { "", s3Bucket, common.MapStr{ - "s3_bucket": s3Bucket, + "bucket": s3Bucket, "bucket_list_interval": "0", }, "bucket_list_interval <0s> must be greater than 0", @@ -175,7 +175,7 @@ func TestConfig(t *testing.T) { "", s3Bucket, common.MapStr{ - "s3_bucket": s3Bucket, + "bucket": s3Bucket, "number_of_workers": "0", }, "number_of_workers <0> must be greater than 0", @@ -231,7 +231,7 @@ func TestConfig(t *testing.T) { "", s3Bucket, common.MapStr{ - "s3_bucket": s3Bucket, + "bucket": s3Bucket, "expand_event_list_from_field": "Records", "content_type": "text/plain", }, From e9049e6413038f0f03f9319b3830f311f9deac0c Mon Sep 17 00:00:00 2001 From: Andrea Spacca Date: Mon, 16 Aug 2021 13:12:51 +0200 Subject: [PATCH 17/20] cr fix --- x-pack/filebeat/input/awss3/s3.go | 43 ++++++++++++------------------- 1 file changed, 17 insertions(+), 26 deletions(-) diff --git a/x-pack/filebeat/input/awss3/s3.go b/x-pack/filebeat/input/awss3/s3.go index 4d2313d35a9b..f1c08d2fb81f 100644 --- a/x-pack/filebeat/input/awss3/s3.go +++ b/x-pack/filebeat/input/awss3/s3.go @@ -105,38 +105,29 @@ func (p *s3Poller) handlePurgingLock(info s3ObjectInfo, isStored bool) { func (p *s3Poller) ProcessObject(s3ObjectPayloadChan <-chan *s3ObjectPayload) error { var errs []error -processingLoop: - for { - select { - case s3ObjectPayload := <-s3ObjectPayloadChan: - if s3ObjectPayload == nil { - break processingLoop - } - - // Process S3 object (download, parse, create events). - err := s3ObjectPayload.s3ObjectHandler.ProcessS3Object() - - // Wait for all events to be ACKed before proceeding. 
- s3ObjectPayload.s3ObjectHandler.Wait() + for s3ObjectPayload := range s3ObjectPayloadChan { + // Process S3 object (download, parse, create events). + err := s3ObjectPayload.s3ObjectHandler.ProcessS3Object() - info := s3ObjectPayload.s3ObjectInfo + // Wait for all events to be ACKed before proceeding. + s3ObjectPayload.s3ObjectHandler.Wait() - if err != nil { - event := s3ObjectPayload.s3ObjectEvent - errs = append(errs, errors.Wrapf(err, - "failed processing S3 event for object key %q in bucket %q", - event.S3.Object.Key, event.S3.Bucket.Name)) + info := s3ObjectPayload.s3ObjectInfo - p.handlePurgingLock(info, false) - continue + if err != nil { + event := s3ObjectPayload.s3ObjectEvent + errs = append(errs, errors.Wrapf(err, + "failed processing S3 event for object key %q in bucket %q", + event.S3.Object.Key, event.S3.Bucket.Name)) - } + p.handlePurgingLock(info, false) + continue + } - p.handlePurgingLock(info, true) + p.handlePurgingLock(info, true) - // Metrics - p.metrics.s3ObjectsAckedTotal.Inc() - } + // Metrics + p.metrics.s3ObjectsAckedTotal.Inc() } return multierr.Combine(errs...) From cf6c277caf5f00d68976e5a459a348cb138f59b3 Mon Sep 17 00:00:00 2001 From: Andrea Spacca Date: Mon, 16 Aug 2021 13:20:33 +0200 Subject: [PATCH 18/20] eventACKTracke Add/Wait must be sequential --- x-pack/filebeat/input/awss3/acker.go | 4 ++++ x-pack/filebeat/input/awss3/acker_test.go | 15 +++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/x-pack/filebeat/input/awss3/acker.go b/x-pack/filebeat/input/awss3/acker.go index 1ac45a8f38e1..ba80601997fe 100644 --- a/x-pack/filebeat/input/awss3/acker.go +++ b/x-pack/filebeat/input/awss3/acker.go @@ -51,6 +51,10 @@ func (a *eventACKTracker) ACK() { } // Wait waits for the number of pending ACKs to be zero. +// Wait must be called sequentially only after every expected +// Add call are made. Failing to do so could reset the pendingACKs +// property to 0 and would results in Wait returning after additional +// calls to `Add` are made without a corresponding `ACK` call. func (a *eventACKTracker) Wait() { // If there were never any pending ACKs then cancel the context. (This can // happen when a document contains no events or cannot be read due to an error). diff --git a/x-pack/filebeat/input/awss3/acker_test.go b/x-pack/filebeat/input/awss3/acker_test.go index 9234479e9850..a038e8a39e44 100644 --- a/x-pack/filebeat/input/awss3/acker_test.go +++ b/x-pack/filebeat/input/awss3/acker_test.go @@ -52,3 +52,18 @@ func TestEventACKHandler(t *testing.T) { assert.EqualValues(t, 0, acker.pendingACKs) assert.ErrorIs(t, acker.ctx.Err(), context.Canceled) } + +func TestEventACKHandlerWait(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + t.Cleanup(cancel) + + // Create acker. Add one pending ACK. 
+ acker := newEventACKTracker(ctx) + acker.Add() + acker.ACK() + acker.Wait() + acker.Add() + + assert.EqualValues(t, 1, acker.pendingACKs) + assert.ErrorIs(t, acker.ctx.Err(), context.Canceled) +} From 5e8594e8bcd9f07e54cfc4ae8ab8662c1ecd3bb4 Mon Sep 17 00:00:00 2001 From: Andrea Spacca Date: Mon, 16 Aug 2021 18:40:16 +0200 Subject: [PATCH 19/20] cr fixes --- filebeat/docs/modules/aws.asciidoc | 2 +- x-pack/filebeat/input/awss3/metrics.go | 4 ++-- x-pack/filebeat/module/aws/_meta/docs.asciidoc | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/filebeat/docs/modules/aws.asciidoc b/filebeat/docs/modules/aws.asciidoc index 94c62f344f78..0a7db9c605d4 100644 --- a/filebeat/docs/modules/aws.asciidoc +++ b/filebeat/docs/modules/aws.asciidoc @@ -173,7 +173,7 @@ Use to vertically scale the input. *`var.bucket_list_interval`*:: -Wait interval between completion of a list request to the S3 bucket and beginning of the nest one. Default to be 120 seconds. +Wait interval between completion of a list request to the S3 bucket and beginning of the next one. Default to be 120 seconds. *`var.endpoint`*:: diff --git a/x-pack/filebeat/input/awss3/metrics.go b/x-pack/filebeat/input/awss3/metrics.go index 045aa7ffe17b..e20baf83c763 100644 --- a/x-pack/filebeat/input/awss3/metrics.go +++ b/x-pack/filebeat/input/awss3/metrics.go @@ -25,8 +25,8 @@ type inputMetrics struct { sqsMessageProcessingTime metrics.Sample // Histogram of the elapsed SQS processing times in nanoseconds (time of receipt to time of delete/return). s3ObjectsRequestedTotal *monitoring.Uint // Number of S3 objects downloaded. - // s3ObjectsListedTotal is the number of S3 objects processed that were fully ACKed. - s3ObjectsAckedTotal *monitoring.Uint // Number of S3 objects fully ACKed. + // s3ObjectsAckedTotal is the number of S3 objects processed that were fully ACKed. + s3ObjectsAckedTotal *monitoring.Uint // s3ObjectsListedTotal is the number of S3 objects returned by list operations. s3ObjectsListedTotal *monitoring.Uint // s3ObjectsProcessedTotal is the number of S3 objects that matched file_selectors rules. diff --git a/x-pack/filebeat/module/aws/_meta/docs.asciidoc b/x-pack/filebeat/module/aws/_meta/docs.asciidoc index ebfdf4703bf5..870385a7a6c2 100644 --- a/x-pack/filebeat/module/aws/_meta/docs.asciidoc +++ b/x-pack/filebeat/module/aws/_meta/docs.asciidoc @@ -168,7 +168,7 @@ Use to vertically scale the input. *`var.bucket_list_interval`*:: -Wait interval between completion of a list request to the S3 bucket and beginning of the nest one. Default to be 120 seconds. +Wait interval between completion of a list request to the S3 bucket and beginning of the next one. Default to be 120 seconds. 
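Because `var.queue_url` and `var.bucket` select mutually exclusive modes, and the input's config validation in this series rejects configurations that set both (or neither), a short sketch of the invalid combination may help; the fileset and values are placeholders:

["source","yaml"]
----
- module: aws
  cloudtrail:
    enabled: true
    # Invalid: both modes selected at once; config validation fails with
    # "queue_url <...> and bucket <...> cannot be set at the same time".
    var.queue_url: https://sqs.myregion.amazonaws.com/123456/myqueue
    var.bucket: 'arn:aws:s3:::mybucket'
----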
*`var.endpoint`*:: From 4087528980e3256c21b177d8a423e4c26ab1aaa8 Mon Sep 17 00:00:00 2001 From: Andrea Spacca Date: Mon, 16 Aug 2021 18:53:58 +0200 Subject: [PATCH 20/20] fix quotes in docs --- filebeat/docs/modules/aws.asciidoc | 12 ++++++------ x-pack/filebeat/module/aws/_meta/docs.asciidoc | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/filebeat/docs/modules/aws.asciidoc b/filebeat/docs/modules/aws.asciidoc index 0a7db9c605d4..78cdc5c961cc 100644 --- a/filebeat/docs/modules/aws.asciidoc +++ b/filebeat/docs/modules/aws.asciidoc @@ -49,7 +49,7 @@ Example config: cloudtrail: enabled: false #var.queue_url: https://sqs.myregion.amazonaws.com/123456/myqueue - #var.bucket: 'arn:aws:s3:::mybucket + #var.bucket: 'arn:aws:s3:::mybucket' #var.bucket_list_interval: 300s #var.number_of_workers: 5 #var.shared_credential_file: /etc/filebeat/aws_credentials @@ -66,7 +66,7 @@ Example config: cloudwatch: enabled: false #var.queue_url: https://sqs.myregion.amazonaws.com/123456/myqueue - #var.bucket: 'arn:aws:s3:::mybucket + #var.bucket: 'arn:aws:s3:::mybucket' #var.bucket_list_interval: 300s #var.number_of_workers: 5 #var.shared_credential_file: /etc/filebeat/aws_credentials @@ -83,7 +83,7 @@ Example config: ec2: enabled: false #var.queue_url: https://sqs.myregion.amazonaws.com/123456/myqueue - #var.bucket: 'arn:aws:s3:::mybucket + #var.bucket: 'arn:aws:s3:::mybucket' #var.bucket_list_interval: 300s #var.number_of_workers: 5 #var.shared_credential_file: /etc/filebeat/aws_credentials @@ -100,7 +100,7 @@ Example config: elb: enabled: false #var.queue_url: https://sqs.myregion.amazonaws.com/123456/myqueue - #var.bucket: 'arn:aws:s3:::mybucket + #var.bucket: 'arn:aws:s3:::mybucket' #var.bucket_list_interval: 300s #var.number_of_workers: 5 #var.shared_credential_file: /etc/filebeat/aws_credentials @@ -117,7 +117,7 @@ Example config: s3access: enabled: false #var.queue_url: https://sqs.myregion.amazonaws.com/123456/myqueue - #var.bucket: 'arn:aws:s3:::mybucket + #var.bucket: 'arn:aws:s3:::mybucket' #var.bucket_list_interval: 300s #var.number_of_workers: 5 #var.shared_credential_file: /etc/filebeat/aws_credentials @@ -134,7 +134,7 @@ Example config: vpcflow: enabled: false #var.queue_url: https://sqs.myregion.amazonaws.com/123456/myqueue - #var.bucket: 'arn:aws:s3:::mybucket + #var.bucket: 'arn:aws:s3:::mybucket' #var.bucket_list_interval: 300s #var.number_of_workers: 5 #var.shared_credential_file: /etc/filebeat/aws_credentials diff --git a/x-pack/filebeat/module/aws/_meta/docs.asciidoc b/x-pack/filebeat/module/aws/_meta/docs.asciidoc index 870385a7a6c2..2e90084b1f05 100644 --- a/x-pack/filebeat/module/aws/_meta/docs.asciidoc +++ b/x-pack/filebeat/module/aws/_meta/docs.asciidoc @@ -44,7 +44,7 @@ Example config: cloudtrail: enabled: false #var.queue_url: https://sqs.myregion.amazonaws.com/123456/myqueue - #var.bucket: 'arn:aws:s3:::mybucket + #var.bucket: 'arn:aws:s3:::mybucket' #var.bucket_list_interval: 300s #var.number_of_workers: 5 #var.shared_credential_file: /etc/filebeat/aws_credentials @@ -61,7 +61,7 @@ Example config: cloudwatch: enabled: false #var.queue_url: https://sqs.myregion.amazonaws.com/123456/myqueue - #var.bucket: 'arn:aws:s3:::mybucket + #var.bucket: 'arn:aws:s3:::mybucket' #var.bucket_list_interval: 300s #var.number_of_workers: 5 #var.shared_credential_file: /etc/filebeat/aws_credentials @@ -78,7 +78,7 @@ Example config: ec2: enabled: false #var.queue_url: https://sqs.myregion.amazonaws.com/123456/myqueue - #var.bucket: 'arn:aws:s3:::mybucket + #var.bucket: 
'arn:aws:s3:::mybucket' #var.bucket_list_interval: 300s #var.number_of_workers: 5 #var.shared_credential_file: /etc/filebeat/aws_credentials @@ -95,7 +95,7 @@ Example config: elb: enabled: false #var.queue_url: https://sqs.myregion.amazonaws.com/123456/myqueue - #var.bucket: 'arn:aws:s3:::mybucket + #var.bucket: 'arn:aws:s3:::mybucket' #var.bucket_list_interval: 300s #var.number_of_workers: 5 #var.shared_credential_file: /etc/filebeat/aws_credentials @@ -112,7 +112,7 @@ Example config: s3access: enabled: false #var.queue_url: https://sqs.myregion.amazonaws.com/123456/myqueue - #var.bucket: 'arn:aws:s3:::mybucket + #var.bucket: 'arn:aws:s3:::mybucket' #var.bucket_list_interval: 300s #var.number_of_workers: 5 #var.shared_credential_file: /etc/filebeat/aws_credentials @@ -129,7 +129,7 @@ Example config: vpcflow: enabled: false #var.queue_url: https://sqs.myregion.amazonaws.com/123456/myqueue - #var.bucket: 'arn:aws:s3:::mybucket + #var.bucket: 'arn:aws:s3:::mybucket' #var.bucket_list_interval: 300s #var.number_of_workers: 5 #var.shared_credential_file: /etc/filebeat/aws_credentials
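Taken together, the renames in this series (`s3_bucket` to `bucket`, `s3_bucket_number_of_workers` to `number_of_workers`, `s3_bucket_poll_interval` to `bucket_list_interval`) mean a standalone `aws-s3` input in bucket-polling mode looks roughly like the sketch below. Option names follow `config_test.go` and the module templates above; the bucket ARN and credential profile are placeholder values reused from the docs in this patch set:

["source","yaml"]
----
filebeat.inputs:
- type: aws-s3
  bucket: 'arn:aws:s3:::mybucket'   # mutually exclusive with queue_url
  number_of_workers: 5              # required when bucket is set
  bucket_list_interval: 300s        # wait after one listing completes before starting the next
  credential_profile_name: fb-aws
  expand_event_list_from_field: Records
----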