From 69c2f92c19bb259449f2d5cbffa75402b9826655 Mon Sep 17 00:00:00 2001
From: Roger Coll
Date: Tue, 10 Dec 2024 15:57:07 +0100
Subject: [PATCH] docs: add EDOT collector kube-stack Helm values (#5822)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* docs: add EDOT collector kube-stack Helm values

* chore: add changelog entry

* Update deploy/helm/edot-collector/kube-stack/README.md

Co-authored-by: Andrew Gizas

* Update deploy/helm/edot-collector/kube-stack/README.md

Co-authored-by: Andrew Gizas

* Update deploy/helm/edot-collector/kube-stack/README.md

Co-authored-by: Andrew Gizas

* Update deploy/helm/edot-collector/kube-stack/README.md

Co-authored-by: Andrew Gizas

* Update deploy/helm/edot-collector/kube-stack/README.md

Co-authored-by: Andrew Gizas

* Update deploy/helm/edot-collector/kube-stack/README.md

Co-authored-by: Andrew Gizas

* Update deploy/helm/edot-collector/kube-stack/README.md

Co-authored-by: Andrew Gizas

* fix: values path relative path

* add kube-stack Helm integration test

* fix: relative values file

* ci: load values file with Helm Go package

* ci: remove agent's hardcoded image

* Update deploy/helm/edot-collector/kube-stack/README.md

Co-authored-by: Nathan L Smith

* Update deploy/helm/edot-collector/kube-stack/README.md

Co-authored-by: Nathan L Smith

* Update deploy/helm/edot-collector/kube-stack/README.md

Co-authored-by: Nathan L Smith

* Update deploy/helm/edot-collector/kube-stack/README.md

Co-authored-by: Nathan L Smith

* Update deploy/helm/edot-collector/kube-stack/README.md

Co-authored-by: Nathan L Smith

* Update deploy/helm/edot-collector/kube-stack/README.md

Co-authored-by: Nathan L Smith

* Update deploy/helm/edot-collector/kube-stack/README.md

Co-authored-by: Nathan L Smith

* Update deploy/helm/edot-collector/kube-stack/README.md

Co-authored-by: Nathan L Smith

* add kube-stack values file to Update mage target

* override elastic secrets

* fix: parametrize helm release name

* ci: eventually check created pods

* test 5min timeout

* Update deploy/helm/edot-collector/kube-stack/values.yaml

Co-authored-by: Edu González de la Herrán <25320357+eedugon@users.noreply.github.com>

* Update deploy/helm/edot-collector/kube-stack/README.md

Co-authored-by: Edu González de la Herrán <25320357+eedugon@users.noreply.github.com>

* Update deploy/helm/edot-collector/kube-stack/values.yaml

Co-authored-by: Edu González de la Herrán <25320357+eedugon@users.noreply.github.com>

* fix: use constant for Helm values paths

* Update testing/integration/otel_helm_test.go

Co-authored-by: Mikołaj Świątek

* fix: reference kube-stack Chart variables

* assert all pods are running

* chore: clarify pod description comment

---------

Co-authored-by: Andrew Gizas
Co-authored-by: Nathan L Smith
Co-authored-by: Edu González de la Herrán <25320357+eedugon@users.noreply.github.com>
Co-authored-by: Mikołaj Świątek
---
 .../1729235281-kube-stack-helm-chart.yaml     |  32 +
 .../helm/edot-collector/kube-stack/README.md  |  66 ++
 .../edot-collector/kube-stack/values.yaml     | 945 ++++++++++++++++++
 magefile.go                                   | 161 +--
 testing/integration/otel_helm_test.go         | 190 ++++
 5 files changed, 1324 insertions(+), 70 deletions(-)
 create mode 100644 changelog/fragments/1729235281-kube-stack-helm-chart.yaml
 create mode 100644 deploy/helm/edot-collector/kube-stack/README.md
 create mode 100644 deploy/helm/edot-collector/kube-stack/values.yaml
 create mode 100644 testing/integration/otel_helm_test.go

diff --git a/changelog/fragments/1729235281-kube-stack-helm-chart.yaml
b/changelog/fragments/1729235281-kube-stack-helm-chart.yaml
new file mode 100644
index 00000000000..44e36929e6d
--- /dev/null
+++ b/changelog/fragments/1729235281-kube-stack-helm-chart.yaml
@@ -0,0 +1,32 @@
+# Kind can be one of:
+# - breaking-change: a change to previously-documented behavior
+# - deprecation: functionality that is being removed in a later release
+# - bug-fix: fixes a problem in a previous version
+# - enhancement: extends functionality but does not break or fix existing behavior
+# - feature: new functionality
+# - known-issue: problems that we are aware of in a given version
+# - security: impacts on the security of a product or a user’s deployment.
+# - upgrade: important information for someone upgrading from a prior version
+# - other: does not fit into any of the other categories
+kind: other

+# Change summary; a description of the change, around 80 characters long.
+summary: add EDOT collector kube-stack Helm values

+# Long description; in case the summary is not enough to describe the change,
+# this field accommodates a description without length limits.
+# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
+#description:

+# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc.
+component: "elastic-agent"

+# PR URL; optional; the PR number that added the changeset.
+# If not present, it is automatically filled by the tooling by finding the PR where this changelog fragment has been added.
+# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
+# Please provide it if you are adding a fragment for a different PR.
+#pr: https://github.com/owner/repo/1234

+# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
+# If not present, it is automatically filled by the tooling with the issue linked to the PR number.
+#issue: https://github.com/owner/repo/1234
diff --git a/deploy/helm/edot-collector/kube-stack/README.md b/deploy/helm/edot-collector/kube-stack/README.md
new file mode 100644
index 00000000000..b8298e8aee7
--- /dev/null
+++ b/deploy/helm/edot-collector/kube-stack/README.md
@@ -0,0 +1,66 @@
+## Kube-stack Helm Chart

+**More detailed documentation can be found [here](https://github.com/elastic/opentelemetry/blob/main/docs/kubernetes/operator/README.md).**

+The [kube-stack Helm Chart](https://github.com/open-telemetry/opentelemetry-helm-charts/tree/main/charts/opentelemetry-kube-stack#readme) is used to manage the installation of the OpenTelemetry operator (including its CRDs) and to configure a suite of EDOT collectors, which instrument various Kubernetes components to enable comprehensive observability and monitoring.

+The chart is installed with the provided default [`values.yaml`](./values.yaml) file, which can be customized as needed.

+### DaemonSet collectors

+The OpenTelemetry components deployed within the DaemonSet EDOT collectors are responsible for observing specific signals from each node. To ensure complete data collection, these components must be deployed on every node in the cluster; failing to do so will result in incomplete data.
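+
+As a quick sanity check (a suggested verification step, not part of the chart itself), you can compare the number of DaemonSet collector pods against the number of nodes after installation; the pod name prefix below assumes the `opentelemetry-kube-stack` release name used in the installation steps:
+
+```
+$ kubectl get pods -n opentelemetry-operator-system -o wide | grep daemon-collector
+$ kubectl get nodes --no-headers | wc -l
+```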
+
+The DaemonSet collectors handle the following data:

+- Host Metrics: Collects host metrics specific to each node, utilizing the [hostmetrics receiver](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/receiver/hostmetricsreceiver/README.md).
+- Kubernetes Metrics: Captures metrics related to the Kubernetes infrastructure on each node, utilizing the [kubeletstats](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/receiver/kubeletstatsreceiver/README.md) receiver.
+- Logs: Utilizes the [File Log Receiver](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/filelogreceiver#readme) to gather logs from all Pods running on the respective node.
+- OTLP Traces: Utilizes the [OTLP Receiver](https://github.com/open-telemetry/opentelemetry-collector/blob/main/receiver/otlpreceiver#readme), which configures both HTTP and gRPC endpoints on the node to receive OTLP trace data.

+### Deployment collector

+The OpenTelemetry components deployed within a Deployment collector focus on gathering data at the cluster level rather than from individual nodes. A Deployment instance of the collector operates as a single standalone instance, unlike DaemonSet collector instances, which are deployed on every node.

+The Deployment collector handles the following data:

+- Kubernetes Events: Monitors and collects events occurring across the entire Kubernetes cluster, utilizing the [Kubernetes Objects Receiver](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/k8sobjectsreceiver#readme).
+- Cluster Metrics: Captures metrics that provide insights into the overall health and performance of the Kubernetes cluster, utilizing the [Kubernetes Cluster Receiver](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/k8sclusterreceiver#readme).

+### Auto-instrumentation

+The Helm Chart is configured to enable zero-code instrumentation using the [Operator's Instrumentation resource](https://github.com/open-telemetry/opentelemetry-operator/?tab=readme-ov-file#opentelemetry-auto-instrumentation-injection) for the following programming languages:

+- Go
+- Java
+- Node.js
+- Python
+- .NET


+### Installation

+1. Create the `opentelemetry-operator-system` Kubernetes namespace:
+```
+$ kubectl create namespace opentelemetry-operator-system
+```

+2. Create a secret in Kubernetes with the following command.
+   ```
+   kubectl create -n opentelemetry-operator-system secret generic elastic-secret-otel \
+     --from-literal=elastic_endpoint='YOUR_ELASTICSEARCH_ENDPOINT' \
+     --from-literal=elastic_api_key='YOUR_ELASTICSEARCH_API_KEY'
+   ```
+   Don't forget to replace
+   - `YOUR_ELASTICSEARCH_ENDPOINT`: your Elasticsearch endpoint (*with* the `https://` prefix, for example: `https://1234567.us-west2.gcp.elastic-cloud.com:443`).
+   - `YOUR_ELASTICSEARCH_API_KEY`: your Elasticsearch API key.

+3. Execute the following commands to deploy the Helm Chart.
+
+```
+$ helm repo add open-telemetry https://open-telemetry.github.io/opentelemetry-helm-charts
+$ helm repo update
+$ helm upgrade --install --namespace opentelemetry-operator-system opentelemetry-kube-stack open-telemetry/opentelemetry-kube-stack --values ./values.yaml --version 0.3.3
+```

+> [!NOTE]
+> Refer to the [compatibility matrix](https://github.com/elastic/opentelemetry/blob/main/docs/kubernetes/operator/README.md#compatibility-matrix) for a complete list of available manifests and associated Helm chart versions.
diff --git a/deploy/helm/edot-collector/kube-stack/values.yaml b/deploy/helm/edot-collector/kube-stack/values.yaml
new file mode 100644
index 00000000000..b55d531c310
--- /dev/null
+++ b/deploy/helm/edot-collector/kube-stack/values.yaml
@@ -0,0 +1,945 @@
+# For installation and configuration options, refer to the [installation instructions](https://github.com/elastic/opentelemetry/blob/main/docs/kubernetes/operator/README.md)

+# For advanced configuration options, refer to the [official OpenTelemetry Helm chart](https://github.com/open-telemetry/opentelemetry-helm-charts/blob/main/charts/opentelemetry-kube-stack/values.yaml)
+# This file has been tested together with opentelemetry-kube-stack helm chart version: 0.3.3
+opentelemetry-operator:
+  manager:
+    extraArgs:
+      - --enable-go-instrumentation
+  admissionWebhooks:
+    certManager:
+      enabled: false # For production environments, it is [recommended to use cert-manager for better security and scalability](https://github.com/open-telemetry/opentelemetry-helm-charts/tree/main/charts/opentelemetry-operator#tls-certificate-requirement).
+    autoGenerateCert:
+      enabled: true # Enable/disable automatic certificate generation. Set to false if manually managing certificates.
+      recreate: true # Force certificate regeneration on updates. Only applicable if autoGenerateCert.enabled is true.
+crds:
+  create: true # Install the OpenTelemetry Operator CRDs.
+defaultCRConfig:
+  image:
+    repository: "docker.elastic.co/beats/elastic-agent"
+    tag: "9.0.0"
+  targetAllocator:
+    enabled: false # Enable/disable the Operator's Target allocator.
+    # Refer to: https://github.com/open-telemetry/opentelemetry-operator/tree/main/cmd/otel-allocator
+clusterRole:
+  rules:
+    - apiGroups: [""]
+      resources: ["configmaps"]
+      verbs: ["get"]
+# `clusterName` specifies the name of the Kubernetes cluster. It sets the 'k8s.cluster.name' field.
+# The cluster name is automatically detected for EKS/GKE/AKS. Set the value below in environments where the cluster name cannot be detected.
+# clusterName: myClusterName
+collectors:
+  # Cluster is a K8s deployment EDOT collector focused on gathering telemetry
+  # at the cluster level (Kubernetes Events and cluster metrics).
+  cluster:
+    # Configure the pod's resources to control CPU and memory usage.
+    # resources:
+    #   limits:
+    #     cpu: 100m
+    #     memory: 500Mi
+    #   requests:
+    #     cpu: 100m
+    #     memory: 500Mi
+    env:
+      - name: ELASTIC_AGENT_OTEL
+        value: '"true"'
+      - name: ELASTIC_ENDPOINT
+        valueFrom:
+          secretKeyRef:
+            name: elastic-secret-otel
+            key: elastic_endpoint
+      - name: ELASTIC_API_KEY
+        valueFrom:
+          secretKeyRef:
+            name: elastic-secret-otel
+            key: elastic_api_key
+    config:
+      exporters:
+        # [Debug exporter](https://github.com/open-telemetry/opentelemetry-collector/blob/main/exporter/debugexporter/README.md)
+        debug:
+          verbosity: basic # Options: basic, normal, detailed. Choose the verbosity level for debug logs.
+        # [Elasticsearch exporter](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/exporter/elasticsearchexporter/README.md)
+        elasticsearch/otel:
+          endpoints: # List of Elasticsearch endpoints.
+            - ${env:ELASTIC_ENDPOINT}
+          api_key: ${env:ELASTIC_API_KEY} # API key for Elasticsearch authentication.
+          logs_dynamic_index:
+            enabled: true
+          # Enable in order to skip the SSL certificate check
+          # tls:
+          #   insecure_skip_verify: true
+          mapping:
+            mode: otel
+      processors:
+        # [Resource Detection Processor](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/resourcedetectionprocessor)
+        resourcedetection/eks:
+          detectors: [env, eks] # Detects resources from environment variables and EKS (Elastic Kubernetes Service).
+          timeout: 15s
+          override: true
+          eks:
+            resource_attributes:
+              k8s.cluster.name:
+                enabled: true
+        resourcedetection/gcp:
+          detectors: [env, gcp] # Detects resources from environment variables and GCP (Google Cloud Platform).
+          timeout: 2s
+          override: true
+        resourcedetection/aks:
+          detectors: [env, aks] # Detects resources from environment variables and AKS (Azure Kubernetes Service).
+          timeout: 2s
+          override: true
+          aks:
+            resource_attributes:
+              k8s.cluster.name:
+                enabled: true
+        # [Resource Processor](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/resourceprocessor)
+        resource/k8s: # Resource attributes tailored for services within Kubernetes.
+          attributes:
+            - key: service.name # Set the service.name resource attribute based on the well-known app.kubernetes.io/name label.
+              from_attribute: app.label.name
+              action: insert
+            - key: service.name # Set the service.name resource attribute based on the k8s.container.name attribute.
+              from_attribute: k8s.container.name
+              action: insert
+            - key: app.label.name # Delete the app.label.name attribute previously used for service.name.
+              action: delete
+            - key: service.version # Set the service.version resource attribute based on the well-known app.kubernetes.io/version label.
+              from_attribute: app.label.version
+              action: insert
+            - key: app.label.version # Delete the app.label.version attribute previously used for service.version.
+              action: delete
+        resource/hostname:
+          attributes:
+            - key: host.name
+              from_attribute: k8s.node.name
+              action: upsert
+        # [K8s Attributes Processor](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/k8sattributesprocessor)
+        k8sattributes:
+          passthrough: false # Annotates resources with the pod IP and does not try to extract any other metadata.
+          pod_association:
+            # The association below looks at the k8s.pod.ip and k8s.pod.uid resource attributes, or at the connection's context, and tries to match them with the pod that has the same attribute.
+            - sources:
+                - from: resource_attribute
+                  name: k8s.pod.ip
+            - sources:
+                - from: resource_attribute
+                  name: k8s.pod.uid
+            - sources:
+                - from: connection
+          extract:
+            metadata:
+              - "k8s.namespace.name"
+              - "k8s.deployment.name"
+              - "k8s.replicaset.name"
+              - "k8s.statefulset.name"
+              - "k8s.daemonset.name"
+              - "k8s.cronjob.name"
+              - "k8s.job.name"
+              - "k8s.node.name"
+              - "k8s.pod.name"
+              - "k8s.pod.ip"
+              - "k8s.pod.uid"
+              - "k8s.pod.start_time"
+            labels:
+              - tag_name: app.label.name
+                key: app.kubernetes.io/name
+                from: pod
+              - tag_name: app.label.version
+                key: app.kubernetes.io/version
+                from: pod
+      receivers:
+        # [K8s Objects Receiver](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/k8sobjectsreceiver)
+        k8sobjects:
+          objects:
+            - name: events
+              mode: "watch"
+              group: "events.k8s.io"
+              exclude_watch_type:
+                - "DELETED"
+        # [K8s Cluster Receiver](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/k8sclusterreceiver)
+        k8s_cluster:
+          auth_type: serviceAccount # Determines how to authenticate to the K8s API server. This can be one of none (for no auth), serviceAccount (to use the standard service account token provided to the agent pod), or kubeConfig to use credentials from ~/.kube/config.
+          node_conditions_to_report:
+            - Ready
+            - MemoryPressure
+          allocatable_types_to_report:
+            - cpu
+            - memory
+          metrics:
+            k8s.pod.status_reason:
+              enabled: true
+          resource_attributes:
+            k8s.kubelet.version:
+              enabled: true
+            os.description:
+              enabled: true
+            os.type:
+              enabled: true
+            k8s.container.status.last_terminated_reason:
+              enabled: true
+      # [Service Section](https://opentelemetry.io/docs/collector/configuration/#service)
+      service:
+        pipelines:
+          metrics:
+            exporters:
+              - debug
+              - elasticsearch/otel
+            processors:
+              - k8sattributes
+              - resourcedetection/eks
+              - resourcedetection/gcp
+              - resourcedetection/aks
+              - resource/k8s
+              - resource/hostname
+            receivers:
+              - k8s_cluster
+          logs:
+            receivers:
+              - k8sobjects
+            processors:
+              - resourcedetection/eks
+              - resourcedetection/gcp
+              - resourcedetection/aks
+              - resource/hostname
+            exporters:
+              - debug
+              - elasticsearch/otel
+  # Daemon is a K8s daemonset EDOT collector focused on gathering telemetry at
+  # the node level and exposing an OTLP endpoint for data ingestion.
+  # Auto-instrumentation SDKs will use this endpoint.
+  daemon:
+    # Configure the pod's resources to control CPU and memory usage.
+    resources:
+      limits:
+        cpu: 1500m
+        memory: 1500Mi
+      requests:
+        cpu: 100m
+        memory: 500Mi
+    env:
+      # Workaround for the open /mounts error: https://github.com/open-telemetry/opentelemetry-collector-contrib/issues/35990
+      - name: HOST_PROC_MOUNTINFO
+        value: ""
+      - name: ELASTIC_AGENT_OTEL
+        value: '"true"'
+      - name: ELASTIC_ENDPOINT
+        valueFrom:
+          secretKeyRef:
+            name: elastic-secret-otel
+            key: elastic_endpoint
+      - name: ELASTIC_API_KEY
+        valueFrom:
+          secretKeyRef:
+            name: elastic-secret-otel
+            key: elastic_api_key
+      - name: GOMAXPROCS
+        valueFrom:
+          resourceFieldRef:
+            resource: limits.cpu
+      - name: GOMEMLIMIT
+        value: "1025MiB"
+    presets:
+      logsCollection:
+        enabled: true # Enable/disable the collection of the node's logs.
+        storeCheckpoints: true # Store checkpoints for log collection, allowing resumption from the last processed log.
+    hostNetwork: true # Use the host's network namespace. This allows the daemon to access the network interfaces of the host directly.
+    securityContext: # Run the daemon as the root user and group for proper metrics collection.
+ runAsUser: 0 + runAsGroup: 0 + scrape_configs_file: "" # [Prometheus metrics](https://github.com/open-telemetry/opentelemetry-helm-charts/tree/main/charts/opentelemetry-kube-stack#scrape_configs_file-details) + config: + connectors: + # [Signal To Metrics Connector](https://github.com/elastic/opentelemetry-collector-components/tree/main/connector/signaltometricsconnector) + signaltometrics: # Produces metrics from all signal types (traces, logs, or metrics). + logs: + - name: service_summary + include_resource_attributes: + - key: service.name + - key: deployment.environment # service.environment + - key: telemetry.sdk.language # service.language.name + - key: agent.name # set via elastictraceprocessor + attributes: + - key: metricset.name + default_value: service_summary + sum: + value: "1" + datapoints: + - name: service_summary + include_resource_attributes: + - key: service.name + - key: deployment.environment # service.environment + - key: telemetry.sdk.language # service.language.name + - key: agent.name # set via elastictraceprocessor + attributes: + - key: metricset.name + default_value: service_summary + sum: + value: "1" + spans: + - name: service_summary + include_resource_attributes: + - key: service.name + - key: deployment.environment # service.environment + - key: telemetry.sdk.language # service.language.name + - key: agent.name # set via elastictraceprocessor + attributes: + - key: metricset.name + default_value: service_summary + sum: + value: Int(AdjustedCount()) + - name: transaction.duration.histogram + description: APM service transaction aggregated metrics as histogram + include_resource_attributes: + - key: service.name + - key: deployment.environment # service.environment + - key: telemetry.sdk.language # service.language.name + - key: agent.name # set via elastictraceprocessor + attributes: + - key: transaction.root + - key: transaction.type + - key: metricset.name + default_value: service_transaction + - key: elasticsearch.mapping.hints + default_value: [_doc_count] + unit: us + exponential_histogram: + value: Microseconds(end_time - start_time) + - name: transaction.duration.summary + description: APM service transaction aggregated metrics as summary + include_resource_attributes: + - key: service.name + - key: deployment.environment # service.environment + - key: telemetry.sdk.language # service.language.name + - key: agent.name # set via elastictraceprocessor + attributes: + - key: transaction.root + - key: transaction.type + - key: metricset.name + default_value: service_transaction + - key: elasticsearch.mapping.hints + default_value: [aggregate_metric_double] + unit: us + histogram: + buckets: [1] + value: Microseconds(end_time - start_time) + - name: transaction.duration.histogram + description: APM transaction aggregated metrics as histogram + ephemeral_resource_attribute: true + include_resource_attributes: + - key: service.name + - key: deployment.environment # service.environment + - key: telemetry.sdk.language # service.language.name + - key: agent.name # set via elastictraceprocessor + - key: container.id + - key: k8s.pod.name + - key: service.version + - key: service.instance.id # service.node.name + - key: process.runtime.name # service.runtime.name + - key: process.runtime.version # service.runtime.version + - key: telemetry.sdk.version # service.language.version?? 
+            - key: host.name
+            - key: os.type # host.os.platform
+            - key: faas.instance
+            - key: faas.name
+            - key: faas.version
+            - key: cloud.provider
+            - key: cloud.region
+            - key: cloud.availability_zone
+            - key: cloud.platform # cloud.servicename
+            - key: cloud.account.id
+          attributes:
+            - key: transaction.root
+            - key: transaction.name
+            - key: transaction.type
+            - key: transaction.result
+            - key: event.outcome
+            - key: metricset.name
+              default_value: transaction
+            - key: elasticsearch.mapping.hints
+              default_value: [_doc_count]
+          unit: us
+          exponential_histogram:
+            value: Microseconds(end_time - start_time)
+        - name: transaction.duration.summary
+          description: APM transaction aggregated metrics as summary
+          ephemeral_resource_attribute: true
+          include_resource_attributes:
+            - key: service.name
+            - key: deployment.environment # service.environment
+            - key: telemetry.sdk.language # service.language.name
+            - key: agent.name # set via elastictraceprocessor
+            - key: container.id
+            - key: k8s.pod.name
+            - key: service.version
+            - key: service.instance.id # service.node.name
+            - key: process.runtime.name # service.runtime.name
+            - key: process.runtime.version # service.runtime.version
+            - key: telemetry.sdk.version # service.language.version??
+            - key: host.name
+            - key: os.type # host.os.platform
+            - key: faas.instance
+            - key: faas.name
+            - key: faas.version
+            - key: cloud.provider
+            - key: cloud.region
+            - key: cloud.availability_zone
+            - key: cloud.platform # cloud.servicename
+            - key: cloud.account.id
+          attributes:
+            - key: transaction.root
+            - key: transaction.name
+            - key: transaction.type
+            - key: transaction.result
+            - key: event.outcome
+            - key: metricset.name
+              default_value: transaction
+            - key: elasticsearch.mapping.hints
+              default_value: [aggregate_metric_double]
+          unit: us
+          histogram:
+            buckets: [1]
+            value: Microseconds(end_time - start_time)
+        - name: span.destination.service.response_time.sum.us
+          description: APM span destination metrics
+          ephemeral_resource_attribute: true
+          include_resource_attributes:
+            - key: service.name
+            - key: deployment.environment # service.environment
+            - key: telemetry.sdk.language # service.language.name
+            - key: agent.name # set via elastictraceprocessor
+          attributes:
+            - key: span.name
+            - key: event.outcome
+            - key: service.target.type
+            - key: service.target.name
+            - key: span.destination.service.resource
+            - key: metricset.name
+              default_value: service_destination
+          unit: us
+          sum:
+            value: Double(Microseconds(end_time - start_time))
+        - name: span.destination.service.response_time.count
+          description: APM span destination metrics
+          ephemeral_resource_attribute: true
+          include_resource_attributes:
+            - key: service.name
+            - key: deployment.environment # service.environment
+            - key: telemetry.sdk.language # service.language.name
+            - key: agent.name # set via elastictraceprocessor
+          attributes:
+            - key: span.name
+            - key: event.outcome
+            - key: service.target.type
+            - key: service.target.name
+            - key: span.destination.service.resource
+            - key: metricset.name
+              default_value: service_destination
+          sum:
+            value: Int(AdjustedCount())
+        # event.success_count is populated using two metric definitions with different
+        # conditions and histogram bucket values based on the event outcome. Both
+        # definitions use the same name and attributes, so they result in a single
+        # histogram. The aggregate_metric_double mapping hint means that only the sum
+        # and the count values are required, and the actual histogram bucket is ignored.
+        - name: event.success_count
+          description: Success count as a metric for service transaction
+          include_resource_attributes:
+            - key: service.name
+            - key: deployment.environment # service.environment
+            - key: telemetry.sdk.language # service.language.name
+            - key: agent.name # set via elastictraceprocessor
+          attributes:
+            - key: transaction.root
+            - key: transaction.type
+            - key: metricset.name
+              default_value: service_transaction
+            - key: elasticsearch.mapping.hints
+              default_value: [aggregate_metric_double]
+          conditions:
+            - attributes["event.outcome"] != nil and attributes["event.outcome"] == "success"
+          unit: us
+          histogram:
+            buckets: [1]
+            count: Int(AdjustedCount())
+            value: Int(AdjustedCount())
+        - name: event.success_count
+          description: Success count as a metric for service transaction
+          include_resource_attributes:
+            - key: service.name
+            - key: deployment.environment # service.environment
+            - key: telemetry.sdk.language # service.language.name
+            - key: agent.name # set via elastictraceprocessor
+          attributes:
+            - key: transaction.root
+            - key: transaction.type
+            - key: metricset.name
+              default_value: service_transaction
+            - key: elasticsearch.mapping.hints
+              default_value: [aggregate_metric_double]
+          conditions:
+            - attributes["event.outcome"] != nil and attributes["event.outcome"] != "success"
+          unit: us
+          histogram:
+            buckets: [0]
+            count: Int(AdjustedCount())
+            value: Double(0)
+      exporters:
+        # [Debug exporter](https://github.com/open-telemetry/opentelemetry-collector/blob/main/exporter/debugexporter/README.md)
+        debug:
+          verbosity: basic
+        # [Elasticsearch exporter](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/exporter/elasticsearchexporter/README.md)
+        elasticsearch/otel:
+          endpoints:
+            - ${env:ELASTIC_ENDPOINT}
+          api_key: ${env:ELASTIC_API_KEY}
+          metrics_dynamic_index:
+            enabled: true
+          logs_dynamic_index:
+            enabled: true
+          traces_dynamic_index:
+            enabled: true
+          flush:
+            interval: 10s
+          # tls:
+          #   insecure_skip_verify: true
+          mapping:
+            mode: otel
+        # [Elasticsearch exporter](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/exporter/elasticsearchexporter/README.md)
+        elasticsearch/ecs:
+          endpoints:
+            - ${env:ELASTIC_ENDPOINT}
+          api_key: ${env:ELASTIC_API_KEY}
+          # tls:
+          #   insecure_skip_verify: true
+          mapping:
+            mode: ecs
+      processors:
+        # [Batch Processor](https://github.com/open-telemetry/opentelemetry-collector/tree/main/processor/batchprocessor)
+        batch: {}
+        # [Elastic Trace Processor](https://github.com/elastic/opentelemetry-collector-components/tree/main/processor/elastictraceprocessor)
+        elastictrace: {} # The processor enriches traces with Elastic-specific requirements.
+ # [LSM Interval Processor](https://github.com/elastic/opentelemetry-collector-components/tree/main/processor/lsmintervalprocessor) + lsminterval: + intervals: + - duration: 1m + statements: + - set(resource.attributes["metricset.interval"], "1m") + - set(attributes["data_stream.dataset"], Concat([attributes["metricset.name"], "1m"], ".")) + - set(attributes["processor.event"], "metric") + - duration: 10m + statements: + - set(resource.attributes["metricset.interval"], "10m") + - set(attributes["data_stream.dataset"], Concat([attributes["metricset.name"], "10m"], ".")) + - set(attributes["processor.event"], "metric") + - duration: 60m + statements: + - set(resource.attributes["metricset.interval"], "60m") + - set(attributes["data_stream.dataset"], Concat([attributes["metricset.name"], "60m"], ".")) + - set(attributes["processor.event"], "metric") + # [Elastic Infra Metrics Processor](https://github.com/elastic/opentelemetry-collector-components/tree/main/processor/elasticinframetricsprocessor) + elasticinframetrics: + add_system_metrics: true + add_k8s_metrics: true + drop_original: true + # [Resource Detection Processor](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/resourcedetectionprocessor) + resourcedetection/eks: + detectors: [env, eks] # Detects resources from environment variables and EKS (Elastic Kubernetes Service). + timeout: 15s + override: true + eks: + resource_attributes: + k8s.cluster.name: + enabled: true + resourcedetection/gcp: + detectors: [env, gcp] # Detects resources from environment variables and GCP (Google Cloud Platform). + timeout: 2s + override: true + resourcedetection/aks: + detectors: [env, aks] # Detects resources from environment variables and AKS (Azure Kubernetes Service). + timeout: 2s + override: true + aks: + resource_attributes: + k8s.cluster.name: + enabled: true + resource/hostname: + attributes: + - key: host.name + from_attribute: k8s.node.name + action: upsert + resourcedetection/system: + detectors: ["system", "ec2"] # Detects resources from the system and EC2 instances. + system: + hostname_sources: ["os"] + resource_attributes: + host.name: + enabled: true + host.id: + enabled: false + host.arch: + enabled: true + host.ip: + enabled: true + host.mac: + enabled: true + host.cpu.vendor.id: + enabled: true + host.cpu.family: + enabled: true + host.cpu.model.id: + enabled: true + host.cpu.model.name: + enabled: true + host.cpu.stepping: + enabled: true + host.cpu.cache.l2.size: + enabled: true + os.description: + enabled: true + os.type: + enabled: true + ec2: + resource_attributes: + host.name: + enabled: false + host.id: + enabled: true + # [Resource Processor](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/resourceprocessor) + resource/k8s: # Resource attributes tailored for services within Kubernetes. 
+          attributes:
+            - key: service.name # Set the service.name resource attribute based on the well-known app.kubernetes.io/name label.
+              from_attribute: app.label.name
+              action: insert
+            - key: service.name # Set the service.name resource attribute based on the k8s.container.name attribute.
+              from_attribute: k8s.container.name
+              action: insert
+            - key: app.label.name # Delete the app.label.name attribute previously used for service.name.
+              action: delete
+            - key: service.version # Set the service.version resource attribute based on the well-known app.kubernetes.io/version label.
+              from_attribute: app.label.version
+              action: insert
+            - key: app.label.version # Delete the app.label.version attribute previously used for service.version.
+              action: delete
+        resource/cloud:
+          attributes:
+            - key: cloud.instance.id
+              from_attribute: host.id
+              action: insert
+        resource/process:
+          attributes:
+            - key: process.executable.name
+              action: delete
+            - key: process.executable.path
+              action: delete
+        # [Attributes Processor](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/attributesprocessor)
+        attributes/dataset:
+          actions:
+            - key: event.dataset
+              from_attribute: data_stream.dataset
+              action: upsert
+        # [K8s Attributes Processor](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/k8sattributesprocessor)
+        k8sattributes:
+          filter:
+            # Only retrieve pods running on the same node as the collector.
+            node_from_env_var: OTEL_K8S_NODE_NAME
+          passthrough: false
+          pod_association:
+            # The association below looks at the k8s.pod.ip and k8s.pod.uid resource attributes, or at the connection's context, and tries to match them with the pod that has the same attribute.
+            - sources:
+                - from: resource_attribute
+                  name: k8s.pod.ip
+            - sources:
+                - from: resource_attribute
+                  name: k8s.pod.uid
+            - sources:
+                - from: connection
+          extract:
+            metadata:
+              - "k8s.namespace.name"
+              - "k8s.deployment.name"
+              - "k8s.replicaset.name"
+              - "k8s.statefulset.name"
+              - "k8s.daemonset.name"
+              - "k8s.cronjob.name"
+              - "k8s.job.name"
+              - "k8s.node.name"
+              - "k8s.pod.name"
+              - "k8s.pod.ip"
+              - "k8s.pod.uid"
+              - "k8s.pod.start_time"
+            labels:
+              - tag_name: app.label.name
+                key: app.kubernetes.io/name
+                from: pod
+              - tag_name: app.label.version
+                key: app.kubernetes.io/version
+                from: pod
+        k8sattributes/ecs:
+          filter:
+            # Only retrieve pods running on the same node as the collector.
+            node_from_env_var: OTEL_K8S_NODE_NAME
+          passthrough: false
+          pod_association:
+            # The association below looks at the k8s.pod.ip and k8s.pod.uid resource attributes, or at the connection's context, and tries to match them with the pod that has the same attribute.
+            - sources:
+                - from: resource_attribute
+                  name: k8s.pod.ip
+            - sources:
+                - from: resource_attribute
+                  name: k8s.pod.uid
+            - sources:
+                - from: connection
+          extract:
+            metadata:
+              - "k8s.replicaset.name"
+              - "k8s.statefulset.name"
+              - "k8s.daemonset.name"
+              - "k8s.cronjob.name"
+              - "k8s.job.name"
+      receivers:
+        # [OTLP Receiver](https://github.com/open-telemetry/opentelemetry-collector/tree/main/receiver/otlpreceiver)
+        otlp:
+          protocols:
+            grpc:
+              endpoint: 0.0.0.0:4317
+            http:
+              endpoint: 0.0.0.0:4318
+        # [File Log Receiver](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/filelogreceiver)
+        filelog:
+          retry_on_failure:
+            enabled: true
+          start_at: end
+          exclude:
+            # Exclude the collector's own logs.
+            - /var/log/pods/opentelemetry-operator-system_opentelemetry-kube-stack*/*/*.log
+          include:
+            - /var/log/pods/*/*/*.log
+          include_file_name: false
+          include_file_path: true
+          operators:
+            - id: container-parser # Extract the container's metadata.
+              type: container
+        # [Hostmetrics Receiver](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/hostmetricsreceiver)
+        hostmetrics:
+          collection_interval: 10s
+          root_path: /hostfs # The node's root file system, mounted into the collector pod.
+          scrapers:
+            cpu:
+              metrics:
+                system.cpu.utilization:
+                  enabled: true
+                system.cpu.logical.count:
+                  enabled: true
+            memory:
+              metrics:
+                system.memory.utilization:
+                  enabled: true
+            process:
+              mute_process_exe_error: true
+              mute_process_io_error: true
+              mute_process_user_error: true
+              metrics:
+                process.threads:
+                  enabled: true
+                process.open_file_descriptors:
+                  enabled: true
+                process.memory.utilization:
+                  enabled: true
+                process.disk.operations:
+                  enabled: true
+            network: {}
+            processes: {}
+            load: {}
+            disk: {}
+            filesystem:
+              exclude_mount_points:
+                mount_points:
+                  - /dev/*
+                  - /proc/*
+                  - /sys/*
+                  - /run/k3s/containerd/*
+                  - /var/lib/docker/*
+                  - /var/lib/kubelet/*
+                  - /snap/*
+                match_type: regexp
+              exclude_fs_types:
+                fs_types:
+                  - autofs
+                  - binfmt_misc
+                  - bpf
+                  - cgroup2
+                  - configfs
+                  - debugfs
+                  - devpts
+                  - devtmpfs
+                  - fusectl
+                  - hugetlbfs
+                  - iso9660
+                  - mqueue
+                  - nsfs
+                  - overlay
+                  - proc
+                  - procfs
+                  - pstore
+                  - rpc_pipefs
+                  - securityfs
+                  - selinuxfs
+                  - squashfs
+                  - sysfs
+                  - tracefs
+                match_type: strict
+        # [Kubelet Stats Receiver](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/kubeletstatsreceiver)
+        kubeletstats:
+          auth_type: serviceAccount # Authentication mechanism for the Kubelet endpoint; refer to: https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/kubeletstatsreceiver#configuration
+          collection_interval: 20s
+          endpoint: ${env:OTEL_K8S_NODE_NAME}:10250
+          node: '${env:OTEL_K8S_NODE_NAME}'
+          # Required to work across all CSPs without issues.
+          insecure_skip_verify: true
+          k8s_api_config:
+            auth_type: serviceAccount
+          metrics:
+            k8s.pod.memory.node.utilization:
+              enabled: true
+            k8s.pod.cpu.node.utilization:
+              enabled: true
+            k8s.container.cpu_limit_utilization:
+              enabled: true
+            k8s.pod.cpu_limit_utilization:
+              enabled: true
+            k8s.container.cpu_request_utilization:
+              enabled: true
+            k8s.container.memory_limit_utilization:
+              enabled: true
+            k8s.pod.memory_limit_utilization:
+              enabled: true
+            k8s.container.memory_request_utilization:
+              enabled: true
+            k8s.node.uptime:
+              enabled: true
+            k8s.node.cpu.usage:
+              enabled: true
+            k8s.pod.cpu.usage:
+              enabled: true
+          extra_metadata_labels:
+            - container.id
+      # [Service
Section](https://opentelemetry.io/docs/collector/configuration/#service) + service: + pipelines: + logs/node: + receivers: + - filelog + processors: + - batch + - k8sattributes + - resourcedetection/system + - resourcedetection/eks + - resourcedetection/gcp + - resourcedetection/aks + - resource/k8s + - resource/hostname + - resource/cloud + exporters: + - debug + - elasticsearch/otel + metrics/node/otel: + receivers: + - kubeletstats + processors: + - batch + - k8sattributes + - resourcedetection/system + - resourcedetection/eks + - resourcedetection/gcp + - resourcedetection/aks + - resource/k8s + - resource/hostname + - resource/cloud + exporters: + - debug + - elasticsearch/otel + metrics/node/ecs: + receivers: + - hostmetrics + - kubeletstats + processors: + - elasticinframetrics + - batch + - k8sattributes/ecs + - resourcedetection/system + - resourcedetection/eks + - resourcedetection/gcp + - resourcedetection/aks + - resource/k8s + - resource/hostname + - resource/cloud + - attributes/dataset + - resource/process + exporters: + - debug + - elasticsearch/ecs + metrics/otel-apm: + receivers: + - otlp + processors: + - batch + - resource/hostname + exporters: + - debug + - signaltometrics + - elasticsearch/otel + logs/apm: + receivers: + - otlp + processors: + - batch + - resource/hostname + exporters: + - debug + - signaltometrics + - elasticsearch/otel + traces/apm: + receivers: + - otlp + processors: + - batch + - elastictrace + - resource/hostname + exporters: + - debug + - signaltometrics + - elasticsearch/otel + metrics/aggregated-otel-metrics: + receivers: + - signaltometrics + processors: + - batch + - lsminterval + exporters: + - debug + - elasticsearch/otel +# For more details on OpenTelemetry's zero-code instrumentation, see: +# https://opentelemetry.io/docs/concepts/instrumentation/zero-code/ +instrumentation: + name: elastic-instrumentation + enabled: true # Enable/disable auto-instrumentation. + exporter: + endpoint: http://opentelemetry-kube-stack-daemon-collector.opentelemetry-operator-system.svc.cluster.local:4318 # The daemonset OpenTelemetry Collector endpoint where telemetry data will be exported. + propagators: + - tracecontext # W3C TraceContext propagator for distributed tracing. + - baggage # Baggage propagator to include baggage information in trace context. + - b3 # B3 propagator for Zipkin-based distributed tracing compatibility. + sampler: + type: parentbased_traceidratio # Sampler type + argument: "1.0" # Sampling rate set to 100% (all traces are sampled). 
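+  # Illustrative only (not part of the chart defaults): to opt a workload in to
+  # zero-code instrumentation, annotate its Pod template with the operator's
+  # language-specific inject annotation, referencing the Instrumentation resource
+  # defined above by "<namespace>/<name>". For example, for a Java application:
+  #   annotations:
+  #     instrumentation.opentelemetry.io/inject-java: "opentelemetry-operator-system/elastic-instrumentation"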
+ java: + image: docker.elastic.co/observability/elastic-otel-javaagent:1.0.0 + nodejs: + image: docker.elastic.co/observability/elastic-otel-node:0.4.1 + dotnet: + image: docker.elastic.co/observability/elastic-otel-dotnet:edge + python: + image: docker.elastic.co/observability/elastic-otel-python:0.3.0 + go: + image: ghcr.io/open-telemetry/opentelemetry-go-instrumentation/autoinstrumentation-go:v0.14.0-alpha diff --git a/magefile.go b/magefile.go index 7512386fb0e..d2a26565946 100644 --- a/magefile.go +++ b/magefile.go @@ -94,9 +94,9 @@ const ( baseURLForStagingDRA = "https://staging.elastic.co/" agentCoreProjectName = "elastic-agent-core" - helmChartPath = "./deploy/helm/elastic-agent" - - sha512FileExt = ".sha512" + helmChartPath = "./deploy/helm/elastic-agent" + helmOtelChartPath = "./deploy/helm/edot-collector/kube-stack" + sha512FileExt = ".sha512" ) var ( @@ -114,6 +114,25 @@ var ( goIntegTestTimeout = 2 * time.Hour // goProvisionAndTestTimeout is the timeout used for both provisioning and running tests. goProvisionAndTestTimeout = goIntegTestTimeout + 30*time.Minute + + helmChartsValues = []struct { + path string + versionKeys []string + tagKeys []string + }{ + // elastic-agent Helm Chart + { + helmChartPath, + []string{"agent", "version"}, + []string{"agent", "image", "tag"}, + }, + // edot-collector values file for kube-stack Helm Chart + { + helmOtelChartPath, + []string{"defaultCRConfig", "image", "tag"}, + nil, + }, + } ) func init() { @@ -397,7 +416,7 @@ func (Build) TestBinaries() error { if err != nil { return err } - err = os.Chmod(outputName, 0755) + err = os.Chmod(outputName, 0o755) if err != nil { return err } @@ -631,7 +650,7 @@ func GoInstall(link string) error { // Mkdir returns a function that create a directory. func Mkdir(dir string) func() error { return func() error { - if err := os.MkdirAll(dir, 0700); err != nil { + if err := os.MkdirAll(dir, 0o700); err != nil { return fmt.Errorf("failed to create directory: %v, error: %+v", dir, err) } return nil @@ -989,7 +1008,7 @@ func packageAgent(ctx context.Context, platforms []string, dependenciesVersion s if mg.Verbose() { log.Printf("--- creating flat dir in .elastic-agent_flat") } - os.MkdirAll(flatPath, 0755) + os.MkdirAll(flatPath, 0o755) defer os.RemoveAll(flatPath) // extract all dependencies from their archives into flat dir @@ -1061,7 +1080,7 @@ func collectPackageDependencies(platforms []string, packageVersion string, platf continue } targetPath := filepath.Join(archivePath, manifest.PlatformPackages[platform]) - os.MkdirAll(targetPath, 0755) + os.MkdirAll(targetPath, 0o755) packageName := spec.GetPackageName(packageVersion, platform) errGroup.Go(downloadBinary(ctx, spec.ProjectName, packageName, spec.BinaryName, platform, packageVersion, targetPath, completedDownloads)) } @@ -1111,7 +1130,7 @@ func collectPackageDependencies(platforms []string, packageVersion string, platf } targetPath := filepath.Join(archivePath, rp) - os.MkdirAll(targetPath, 0755) + os.MkdirAll(targetPath, 0o755) for _, f := range files { // safety check; if the user has an older version of the beats repo, // for example right after a release where you've `git pulled` from on repo and not the other, @@ -1170,9 +1189,9 @@ func flattenDependencies(requiredPackages []string, packageVersion, archivePath, targetPath := filepath.Join(archivePath, rp) versionedFlatPath := filepath.Join(flatPath, rp) versionedDropPath := filepath.Join(dropPath, rp) - os.MkdirAll(targetPath, 0755) - os.MkdirAll(versionedFlatPath, 0755) - 
os.MkdirAll(versionedDropPath, 0755) + os.MkdirAll(targetPath, 0o755) + os.MkdirAll(versionedFlatPath, 0o755) + os.MkdirAll(versionedDropPath, 0o755) // untar all matches, err := filepath.Glob(filepath.Join(targetPath, "*tar.gz")) @@ -1585,13 +1604,13 @@ func appendComponentChecksums(versionedDropPath string, checksums map[string]str return err } - return os.WriteFile(filepath.Join(versionedDropPath, checksumFilename), content, 0644) + return os.WriteFile(filepath.Join(versionedDropPath, checksumFilename), content, 0o644) } // movePackagesToArchive Create archive folder and move any pre-existing artifacts into it. func movePackagesToArchive(dropPath string, platformPackageSuffixes []string, packageVersion string) string { archivePath := filepath.Join(dropPath, "archives") - os.MkdirAll(archivePath, 0755) + os.MkdirAll(archivePath, 0o755) // move archives to archive path matches, err := filepath.Glob(filepath.Join(dropPath, "*tar.gz*")) @@ -1630,7 +1649,7 @@ func movePackagesToArchive(dropPath string, platformPackageSuffixes []string, pa targetPath := filepath.Join(archivePath, packageSuffix, filepath.Base(f)) targetDir := filepath.Dir(targetPath) - if err := os.MkdirAll(targetDir, 0750); err != nil { + if err := os.MkdirAll(targetDir, 0o750); err != nil { fmt.Printf("warning: failed to create directory %s: %s", targetDir, err) } @@ -1814,7 +1833,7 @@ func saveIronbank() error { distributionsDir := "build/distributions" if _, err := os.Stat(distributionsDir); os.IsNotExist(err) { - err := os.MkdirAll(distributionsDir, 0750) + err := os.MkdirAll(distributionsDir, 0o750) if err != nil { return fmt.Errorf("cannot create folder for docker artifacts: %+v", err) } @@ -2054,7 +2073,7 @@ func (Integration) UpdateVersions(ctx context.Context) error { versionFileData := upgradetest.AgentVersions{ TestVersions: versions, } - file, err := os.OpenFile(upgradetest.AgentVersionsFilename, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644) + file, err := os.OpenFile(upgradetest.AgentVersionsFilename, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0o644) if err != nil { return fmt.Errorf("failed to open %s for write: %w", upgradetest.AgentVersionsFilename, err) } @@ -2095,7 +2114,7 @@ func (Integration) UpdatePackageVersion(ctx context.Context) error { return fmt.Errorf("expected a single version, got %v", versions) } packageVersion := versions[0].CoreVersion() - file, err := os.OpenFile(packageVersionFilename, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644) + file, err := os.OpenFile(packageVersionFilename, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0o644) if err != nil { return fmt.Errorf("failed to open %s for write: %w", packageVersionFilename, err) } @@ -2669,15 +2688,15 @@ func integRunnerOnce(ctx context.Context, matrix bool, singleTest string) (int, _ = os.Remove("build/TEST-go-integration.out") _ = os.Remove("build/TEST-go-integration.out.json") _ = os.Remove("build/TEST-go-integration.xml") - err = writeFile("build/TEST-go-integration.out", results.Output, 0644) + err = writeFile("build/TEST-go-integration.out", results.Output, 0o644) if err != nil { return 0, fmt.Errorf("error writing test out file: %w", err) } - err = writeFile("build/TEST-go-integration.out.json", results.JSONOutput, 0644) + err = writeFile("build/TEST-go-integration.out.json", results.JSONOutput, 0o644) if err != nil { return 0, fmt.Errorf("error writing test out json file: %w", err) } - err = writeFile("build/TEST-go-integration.xml", results.XMLOutput, 0644) + err = writeFile("build/TEST-go-integration.xml", results.XMLOutput, 0o644) if err != nil { return 
0, fmt.Errorf("error writing test out xml file: %w", err) } @@ -2843,7 +2862,7 @@ func createTestRunner(matrix bool, singleTest string, goTestFlags string, batche } diagDir := filepath.Join("build", "diagnostics") - _ = os.MkdirAll(diagDir, 0755) + _ = os.MkdirAll(diagDir, 0o755) cfg := tcommon.Config{ AgentVersion: agentVersion, @@ -3129,11 +3148,11 @@ func authESS(ctx context.Context) error { } _, err = os.Stat(essAPIKeyFile) if os.IsNotExist(err) { - if err := os.MkdirAll(filepath.Dir(essAPIKeyFile), 0700); err != nil { + if err := os.MkdirAll(filepath.Dir(essAPIKeyFile), 0o700); err != nil { return fmt.Errorf("unable to create ESS config directory: %w", err) } - if err := os.WriteFile(essAPIKeyFile, nil, 0600); err != nil { + if err := os.WriteFile(essAPIKeyFile, nil, 0o600); err != nil { return fmt.Errorf("unable to initialize ESS API key file: %w", err) } } else if err != nil { @@ -3173,7 +3192,7 @@ func authESS(ctx context.Context) error { } // Write API key to file for future use - if err := os.WriteFile(essAPIKeyFile, []byte(essAPIKey), 0600); err != nil { + if err := os.WriteFile(essAPIKeyFile, []byte(essAPIKey), 0o600); err != nil { return fmt.Errorf("unable to persist ESS API key for future use: %w", err) } @@ -3245,7 +3264,7 @@ func (Otel) Readme() error { } // resolve template - out, err := os.OpenFile(readmeOut, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644) + out, err := os.OpenFile(readmeOut, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0o644) if err != nil { return fmt.Errorf("failed to open file %s: %w", readmeOut, err) } @@ -3485,63 +3504,65 @@ func (Helm) RenderExamples() error { } func (Helm) UpdateAgentVersion() error { - valuesFile := filepath.Join(helmChartPath, "values.yaml") + for _, chart := range helmChartsValues { + valuesFile := filepath.Join(chart.path, "values.yaml") - data, err := os.ReadFile(valuesFile) - if err != nil { - return fmt.Errorf("failed to read file: %w", err) - } + data, err := os.ReadFile(valuesFile) + if err != nil { + return fmt.Errorf("failed to read file: %w", err) + } - isTagged, err := devtools.TagContainsCommit() - if err != nil { - return fmt.Errorf("failed to check if tag contains commit: %w", err) - } + isTagged, err := devtools.TagContainsCommit() + if err != nil { + return fmt.Errorf("failed to check if tag contains commit: %w", err) + } - if !isTagged { - isTagged = os.Getenv(snapshotEnv) != "" - } + if !isTagged { + isTagged = os.Getenv(snapshotEnv) != "" + } - agentVersion := getVersion() + agentVersion := getVersion() - // Parse YAML into a Node structure because - // it maintains comments - var rootNode yaml.Node - err = yaml.Unmarshal(data, &rootNode) - if err != nil { - return fmt.Errorf("failed to unmarshal YAML: %w", err) - } + // Parse YAML into a Node structure because + // it maintains comments + var rootNode yaml.Node + err = yaml.Unmarshal(data, &rootNode) + if err != nil { + return fmt.Errorf("failed to unmarshal YAML: %w", err) + } - if rootNode.Kind != yaml.DocumentNode { - return fmt.Errorf("root node is not a document node") - } else if len(rootNode.Content) == 0 { - return fmt.Errorf("root node has no content") - } + if rootNode.Kind != yaml.DocumentNode { + return fmt.Errorf("root node is not a document node") + } else if len(rootNode.Content) == 0 { + return fmt.Errorf("root node has no content") + } - if err := updateYamlNodes(rootNode.Content[0], agentVersion, "agent", "version"); err != nil { - return fmt.Errorf("failed to update agent version: %w", err) - } + if err := updateYamlNodes(rootNode.Content[0], 
agentVersion, chart.versionKeys...); err != nil { + return fmt.Errorf("failed to update agent version: %w", err) + } - if !isTagged { - if err := updateYamlNodes(rootNode.Content[0], fmt.Sprintf("%s-SNAPSHOT", agentVersion), "agent", "image", "tag"); err != nil { - return fmt.Errorf("failed to update agent image tag: %w", err) + if !isTagged && len(chart.tagKeys) > 0 { + if err := updateYamlNodes(rootNode.Content[0], fmt.Sprintf("%s-SNAPSHOT", agentVersion), chart.tagKeys...); err != nil { + return fmt.Errorf("failed to update agent image tag: %w", err) + } } - } - // Truncate values file - file, err := os.Create(valuesFile) - if err != nil { - return fmt.Errorf("failed to open file for writing: %w", err) - } - defer file.Close() + // Truncate values file + file, err := os.Create(valuesFile) + if err != nil { + return fmt.Errorf("failed to open file for writing: %w", err) + } + defer file.Close() - // Create a YAML encoder with 2-space indentation - encoder := yaml.NewEncoder(file) - encoder.SetIndent(2) + // Create a YAML encoder with 2-space indentation + encoder := yaml.NewEncoder(file) + encoder.SetIndent(2) - // Encode the updated YAML node back to the file - err = encoder.Encode(&rootNode) - if err != nil { - return fmt.Errorf("failed to encode updated YAML: %w", err) + // Encode the updated YAML node back to the file + err = encoder.Encode(&rootNode) + if err != nil { + return fmt.Errorf("failed to encode updated YAML: %w", err) + } } return nil diff --git a/testing/integration/otel_helm_test.go b/testing/integration/otel_helm_test.go new file mode 100644 index 00000000000..583c18441a9 --- /dev/null +++ b/testing/integration/otel_helm_test.go @@ -0,0 +1,190 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License 2.0; +// you may not use this file except in compliance with the Elastic License 2.0. 
+
+//go:build integration
+
+package integration
+
+import (
+	"context"
+	"crypto/sha256"
+	"encoding/base64"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/require"
+	"helm.sh/helm/v3/pkg/action"
+	"helm.sh/helm/v3/pkg/chart/loader"
+	"helm.sh/helm/v3/pkg/cli"
+	"helm.sh/helm/v3/pkg/cli/values"
+	"helm.sh/helm/v3/pkg/getter"
+	corev1 "k8s.io/api/core/v1"
+
+	"github.com/elastic/elastic-agent/pkg/testing/define"
+)
+
+var (
+	kubeStackChartVersion = "0.3.2"
+	kubeStackChartURL     = "https://github.com/open-telemetry/opentelemetry-helm-charts/releases/download/opentelemetry-kube-stack-" + kubeStackChartVersion + "/opentelemetry-kube-stack-" + kubeStackChartVersion + ".tgz"
+)
+
+func TestOtelKubeStackHelm(t *testing.T) {
+	info := define.Require(t, define.Requirements{
+		Stack: &define.Stack{},
+		Local: false,
+		Sudo:  false,
+		OS: []define.OS{
+			// only test the basic and the wolfi containers with otel
+			{Type: define.Kubernetes, DockerVariant: "basic"},
+			{Type: define.Kubernetes, DockerVariant: "wolfi"},
+		},
+		Group: define.Kubernetes,
+	})
+
+	agentImage := os.Getenv("AGENT_IMAGE")
+	require.NotEmpty(t, agentImage, "AGENT_IMAGE must be set")
+
+	agentImageParts := strings.SplitN(agentImage, ":", 2)
+	require.Len(t, agentImageParts, 2, "AGENT_IMAGE must be in the form '<repository>:<tag>'")
+	agentImageRepo := agentImageParts[0]
+	agentImageTag := agentImageParts[1]
+
+	client, err := info.KubeClient()
+	require.NoError(t, err)
+	require.NotNil(t, client)
+
+	testLogsBasePath := os.Getenv("K8S_TESTS_POD_LOGS_BASE")
+	require.NotEmpty(t, testLogsBasePath, "K8S_TESTS_POD_LOGS_BASE must be set")
+
+	err = os.MkdirAll(filepath.Join(testLogsBasePath, t.Name()), 0o755)
+	require.NoError(t, err, "failed to create test logs directory")
+
+	namespace := info.Namespace
+
+	esHost := os.Getenv("ELASTICSEARCH_HOST")
+	require.NotEmpty(t, esHost, "ELASTICSEARCH_HOST must be set")
+
+	esAPIKey, err := generateESAPIKey(info.ESClient, namespace)
+	require.NoError(t, err, "failed to generate ES API key")
+	require.NotEmpty(t, esAPIKey, "failed to generate ES API key")
+
+	chartOptions := &action.ChartPathOptions{
+		RepoURL: kubeStackChartURL,
+		Version: kubeStackChartVersion,
+	}
+
+	chartLocation, err := action.NewPull().LocateChart(chartOptions.RepoURL, cli.New())
+	require.NoError(t, err, "failed to locate the kube-stack helm chart")
+
+	testCases := []struct {
+		name                       string
+		helmReleaseName            string
+		valuesFile                 string
+		atLeastValidatedPodsNumber int
+	}{
+		{
+			name:            "helm standalone agent default kubernetes privileged",
+			helmReleaseName: "kube-stack-otel",
+			valuesFile:      "../../deploy/helm/edot-collector/kube-stack/values.yaml",
+			// - A DaemonSet to collect K8s node metrics and logs
+			//   (1 EDOT collector pod per node)
+			// - A cluster-wide Deployment to collect K8s metrics and
+			//   events (1 EDOT collector pod per cluster)
+			// - An OpenTelemetry Operator Deployment (1 pod per
+			//   cluster)
+			atLeastValidatedPodsNumber: 3,
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			ctx := context.Background()
+			hasher := sha256.New()
+			hasher.Write([]byte(tc.name))
+			testNamespace := strings.ToLower(base64.URLEncoding.EncodeToString(hasher.Sum(nil)))
+			testNamespace = noSpecialCharsRegexp.ReplaceAllString(testNamespace, "")
+
+			settings := cli.New()
+			settings.SetNamespace(testNamespace)
+			actionConfig := &action.Configuration{}
+
+			helmChart, err := loader.Load(chartLocation)
+			require.NoError(t, err, "failed to load helm chart")
+
+			err =
actionConfig.Init(settings.RESTClientGetter(), settings.Namespace(), "",
+				func(format string, v ...interface{}) {})
+			require.NoError(t, err, "failed to init helm action config")
+
+			options := values.Options{
+				ValueFiles: []string{tc.valuesFile},
+				Values:     []string{fmt.Sprintf("defaultCRConfig.image.repository=%s", agentImageRepo), fmt.Sprintf("defaultCRConfig.image.tag=%s", agentImageTag)},
+
+				// override the secret references with environment variables
+				JSONValues: []string{
+					fmt.Sprintf(`collectors.cluster.env[1]={"name":"ELASTIC_ENDPOINT","value":"%s"}`, esHost),
+					fmt.Sprintf(`collectors.cluster.env[2]={"name":"ELASTIC_API_KEY","value":"%s"}`, esAPIKey),
+					fmt.Sprintf(`collectors.daemon.env[2]={"name":"ELASTIC_ENDPOINT","value":"%s"}`, esHost),
+					fmt.Sprintf(`collectors.daemon.env[3]={"name":"ELASTIC_API_KEY","value":"%s"}`, esAPIKey),
+				},
+			}
+			providers := getter.All(settings)
+			helmValues, err := options.MergeValues(providers)
+			require.NoError(t, err, "failed to merge helm values")
+
+			t.Cleanup(func() {
+				if t.Failed() {
+					dumpLogs(t, ctx, client, testNamespace, testLogsBasePath)
+				}
+
+				uninstallAction := action.NewUninstall(actionConfig)
+				uninstallAction.Wait = true
+
+				_, err := uninstallAction.Run(tc.helmReleaseName)
+				require.NoError(t, err, "failed to uninstall helm chart")
+			})
+
+			installAction := action.NewInstall(actionConfig)
+			installAction.Namespace = testNamespace
+			installAction.CreateNamespace = true
+			installAction.UseReleaseName = true
+			installAction.ReleaseName = tc.helmReleaseName
+			installAction.Timeout = 2 * time.Minute
+			installAction.Wait = true
+			installAction.WaitForJobs = true
+			_, err = installAction.Run(helmChart, helmValues)
+			require.NoError(t, err, "failed to install helm chart")
+
+			// Pods are created by the OpenTelemetry Operator, and it takes
+			// some time for the Operator to become ready.
+			require.Eventually(t, func() bool {
+				podList := &corev1.PodList{}
+				err = client.Resources(testNamespace).List(ctx, podList)
+				require.NoError(t, err, fmt.Sprintf("failed to list pods in namespace %s", testNamespace))
+
+				checkedAgentContainers := 0
+
+				for _, pod := range podList.Items {
+					if strings.HasPrefix(pod.GetName(), tc.helmReleaseName) && pod.Status.Phase == corev1.PodRunning {
+						checkedAgentContainers++
+					}
+				}
+				return checkedAgentContainers >= tc.atLeastValidatedPodsNumber
+			}, 5*time.Minute, 10*time.Second, fmt.Sprintf("at least %d agent containers should be checked", tc.atLeastValidatedPodsNumber))
+		})
+	}
+}
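+
+// Rough sketch of a manual invocation. This test is normally driven by the
+// repository's integration test framework, which provisions the cluster and
+// the environment; the values below are placeholders, not working settings:
+//
+//	AGENT_IMAGE=<repository>:<tag> \
+//	K8S_TESTS_POD_LOGS_BASE="$PWD/build" \
+//	ELASTICSEARCH_HOST=<elasticsearch-endpoint> \
+//	go test -tags integration -run TestOtelKubeStackHelm ./testing/integration/...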