diff --git a/config/config_test.go b/config/config_test.go index d8fc96ff77..e9a210d9a6 100644 --- a/config/config_test.go +++ b/config/config_test.go @@ -136,7 +136,7 @@ MetricsConfig: } func TestReadDefaults(t *testing.T) { - c, err := NewConfig("../config_complete.yaml", "../rules_complete.yaml", func(err error) {}) + c, err := NewConfig("../config_unit_tests.yaml", "../rules_unit_tests.yaml", func(err error) {}) if err != nil { t.Error(err) @@ -187,7 +187,7 @@ func TestReadDefaults(t *testing.T) { } func TestReadRulesConfig(t *testing.T) { - c, err := NewConfig("../config_complete.yaml", "../rules_complete.yaml", func(err error) {}) + c, err := NewConfig("../config_unit_tests.yaml", "../rules_unit_tests.yaml", func(err error) {}) if err != nil { t.Error(err) diff --git a/config_unit_tests.yaml b/config_unit_tests.yaml new file mode 100644 index 0000000000..5073128f96 --- /dev/null +++ b/config_unit_tests.yaml @@ -0,0 +1,406 @@ +######################## +## Trace Proxy Config ## +######################## + +# ListenAddr is the IP and port on which to listen for incoming events. Incoming +# traffic is expected to be HTTP, so if using SSL put something like nginx in +# front to do the TLS Termination. +ListenAddr: 0.0.0.0:8082 + +# GRPCListenAddr is the IP and port on which to listen for incoming events over +# gRPC. Incoming traffic is expected to be unencrypted, so if using SSL put something like nginx in +# front to do the TLS Termination. +GRPCListenAddr: 0.0.0.0:9090 + +# PeerListenAddr is the IP and port on which to listen for traffic being +# rerouted from a peer. Peer traffic is expected to be HTTP, so if using SSL +# put something like nginx in front to do the decryption. Must be different from +# ListenAddr +PeerListenAddr: 0.0.0.0:8083 + +GRPCPeerListenAddr: 0.0.0.0:8084 + +# CompressPeerCommunication determines whether to compress span data +# it forwards to peers. If it costs money to transmit data between different +# instances (e.g. 
they're spread across AWS availability zones), then you +# almost certainly want compression enabled to reduce your bill. The option to +# disable it is provided as an escape hatch for deployments that value lower CPU +# utilization over data transfer costs. +CompressPeerCommunication: true + +# OpsrampAPI is the URL for the upstream Opsramp API. +OpsrampAPI: "https://portal.opsramp.net" + +# Dataset you want to use for sampling +Dataset: "ds" + +#Tls Options +UseTls: true +UseTlsInsecure: false + +# LoggingLevel valid options are "debug", "info", "error", and "panic". +LoggingLevel: error + +# SendDelay is a short timer that will be triggered when a trace is complete. +# Trace Proxy will wait for this duration before actually sending the trace. The +# reason for this short delay is to allow for small network delays or clock +# jitters to elapse and any final spans to arrive before actually sending the +# trace. This supports duration strings with supplied units. Set to 0 for +# immediate sends. +SendDelay: 2s + +# BatchTimeout dictates how frequently to send unfulfilled batches. By default +# this will use the DefaultBatchTimeout in libtrace as its value, which is 100ms. +# Eligible for live reload. +BatchTimeout: 1s + +# TraceTimeout is a long timer; it represents the outside boundary of how long +# to wait before sending an incomplete trace. Normally traces are sent when the +# root span arrives. Sometimes the root span never arrives (due to crashes or +# whatever), and this timer will send a trace even without having received the +# root span. If you have particularly long-lived traces you should increase this +# timer. This supports duration strings with supplied units. 
+TraceTimeout: 60s + +# MaxBatchSize is the number of events to be included in the batch for sending +MaxBatchSize: 500 + +# SendTicker is a short timer; it determines the duration to use to check for traces to send +SendTicker: 100ms + +# UpstreamBufferSize and PeerBufferSize control how large of an event queue to use +# when buffering events that will be forwarded to peers or the upstream API. +UpstreamBufferSize: 1000 +PeerBufferSize: 1000 + +# AddHostMetadataToTrace determines whether to add information about +# the host that tracing proxy is running on to the spans that it processes. +# If enabled, information about the host will be added to each span with the +# key 'meta.local_hostname'. +AddHostMetadataToTrace: false + +# AddAdditionalMetadata adds all the specified key value pairs to the traces and metrics +# the values must be a valid json key value pair like eg: {"key_1":"value_1", "key_2":"value_2"} +# max number of additional keys supported is 5, if the limit exceeds then we considered the first 5 +# based on sorted order of keys +# "app" label is mandatory +AddAdditionalMetadata: { "app": "default" } + +# EnvironmentCacheTTL is the amount of time a cache entry will live that associates +# an API key with an environment name. +# Cache misses lookup the environment name using OpsRampAPI config value. +# Default is 1 hour ("1h"). +EnvironmentCacheTTL: "1h" + +# QueryAuthToken, if specified, provides a token that must be specified with +# the header "X-OpsRamp-Tracing-Proxy-Query" in order for a /query request to succeed. +# These /query requests are intended for debugging OpsRamp-Tracing-Proxy installations and +# are not typically needed in normal operation. +# Can be specified in the environment as TRACING_PROXY_QUERY_AUTH_TOKEN. +# If left unspecified, the /query endpoints are inaccessible. 
+# QueryAuthToken: "some-random-value" + +# AddRuleReasonToTrace causes traces that are sent to OpsRamp to include a field which +# contains text indicating which rule was evaluated that caused the trace to be included. +AddRuleReasonToTrace: true + +# AdditionalErrorFields should be a list of span fields that should be included when logging +# errors that happen during ingestion of events (for example, the span too large error). +# This is primarily useful in trying to track down misbehaving senders in a large installation. +# The fields `dataset`, `apihost`, and `environment` are always included. +# If a field is not present in the span, it will not be present in the error log. +# Default is ["trace.span_id"]. +AdditionalErrorFields: + - trace.span_id + +# AddSpanCountToRoot adds a new metadata field, `meta.span_count` to root spans to indicate +# the number of child spans on the trace at the time the sampling decision was made. +# This value is available to the rules-based sampler, making it possible to write rules that +# are dependent upon the number of spans in the trace. +# Default is false. +AddSpanCountToRoot: false + +# CacheOverrunStrategy controls the cache management behavior under memory pressure. +# "resize" means that when a cache overrun occurs, the cache is shrunk and never grows again, +# which is generally not helpful unless it occurs because of a permanent change in traffic patterns. +# In the "impact" strategy, the items having the most impact on the cache size are +# ejected from the cache earlier than normal but the cache is not resized. +# In all cases, it only applies if MaxAlloc is nonzero. +# Default is "resize" for compatibility but "impact" is recommended for most installations. +CacheOverrunStrategy: "impact" + +######################### +## Retry Configuration ## +######################### +RetryConfiguration: + # InitialInterval the time to wait after the first failure before retrying. 
+ InitialInterval: 500ms + # RandomizationFactor is a random factor used to calculate next backoff + # Randomized interval = RetryInterval * (1 ± RandomizationFactor) + RandomizationFactor: 0.5 + # Multiplier is the value multiplied by the backoff interval bounds + Multiplier: 1.5 + # MaxInterval is the upper bound on backoff interval. Once this value is reached, the delay between + # consecutive retries will always be `MaxInterval`. + MaxInterval: 60s + # MaxElapsedTime is the maximum amount of time (including retries) spent trying to send a request. + # Once this value is reached, the data is discarded. + MaxElapsedTime: 15m + +######################### +## Proxy Configuration ## +######################### +ProxyConfiguration: + # Protocol accepts http and https + Protocol: "http" + # Host takes the proxy server address + Host: "" + # Port takes the proxy server port + Port: 3128 + # UserName takes the proxy username + Username: "" + # Password takes the proxy password + Password: "" + +################################## +## Authentication Configuration ## +################################## +AuthConfiguration: + # Endpoint - the APIServer address provided in OpsRamp Portal to which auth token request is to be made + Endpoint: "https://portal.opsramp.net" + # Key - authentication key provided in OpsRamp Portal + Key: "super-secret-key" + # Secret - authentication Secret provided in OpsRamp Portal + Secret: "super-secret-secret" + # TenantId - tenant/client id to which the traces are to be posted + TenantId: "super-secret-tenantId" + +############################ +## Implementation Choices ## +############################ +# Each of the config options below chooses an implementation of a Trace Proxy +# component to use. Depending on the choice, there may be more configuration +# required below in the section for that choice. Changing implementation choices +# requires a process restart. +# Collector describes which collector to use for collecting traces. 
The only +# current valid option is "InMemCollector". More can be added by adding +# implementations of the Collector interface. +Collector: "InMemCollector" + +# InMemCollector brings together all the settings that are relevant to +# collecting spans together to make traces. +InMemCollector: + + # The collection cache is used to collect all spans into a trace as well as + # remember the sampling decision for any spans that might come in after the + # trace has been marked "complete" (either by timing out or seeing the root + # span). The number of traces in the cache should be many multiples (100x to + # 1000x) of the total number of concurrently active traces (trace throughput * + # trace duration). + CacheCapacity: 1000 + + # MaxAlloc is optional. If set, it must be an integer >= 0. + # If set to a non-zero value, once per tick (see SendTicker) the collector + # will compare total allocated bytes to this value. If allocation is too + # high, cache capacity will be reduced and an error will be logged. + # Useful values for this setting are generally in the range of 75%-90% of + # available system memory. Using 80% is recommended. + # This value should be set according to the resources.limits.memory + # By default that setting is 4GB, and this is set to 80% of that limit + # 4 * 1024 * 1024 * 1024 * 0.80 = 3,435,973,836 + # MaxAlloc: 3435973836 + MaxAlloc: 0 + +##################### +## Peer Management ## +##################### + +# Configure how OpsRamp-Tracing-Proxy peers are discovered and managed +PeerManagement: + # Strategy controls the way that traces are assigned to Trace Proxy nodes. + # The "legacy" strategy uses a simple algorithm that unfortunately causes + # 1/2 of the in-flight traces to be assigned to a different node whenever the + # number of nodes changes. + # The legacy strategy is deprecated and is intended to be removed in a future release. 
+ # The "hash" strategy is strongly recommended, as only 1/N traces (where N is the + # number of nodes) are disrupted when the node count changes. + # Not eligible for live reload. + Strategy: "hash" + + ########################################################### + ###### File (Suitable only for VM based deployments ###### + ###### and single replica k8s deployments) ###### + ########################################################### + #Type: "file" + + # Peers is the list of all servers participating in this proxy cluster. Events + # will be sharded evenly across all peers based on the Trace ID. Values here + # should be the base URL used to access the peer, and should include scheme, + # hostname (or ip address) and port. All servers in the cluster should be in + # this list, including this host. + #Peers: [ + # "http://127.0.0.1:8084", #only grpc peer listener used + #] + ########################################################### + + ########################################################### + ###### Redis (Suitable for all types of deployments) ###### + ########################################################### + ## The type should always be redis when deployed to Kubernetes environments + Type: "redis" + + ## RedisHost is used to connect to redis for peer cluster membership management. + ## Further, if the environment variable 'TRACING_PROXY_REDIS_HOST' is set it takes + ## precedence and this value is ignored. + ## Not eligible for live reload. + ## RedisHost will default to the name used for the release or name overrides depending on what is used, + ## but can be overridden to a specific value. + RedisHost: "127.0.0.1" + + ## RedisUsername is the username used to connect to redis for peer cluster membership management. + ## If the environment variable 'TRACING_PROXY_REDIS_USERNAME' is set it takes + ## precedence and this value is ignored. + ## Not eligible for live reload. 
+ RedisUsername: "" + + ## RedisPassword is the password used to connect to redis for peer cluster membership management. + ## If the environment variable 'TRACING_PROXY_REDIS_PASSWORD' is set it takes + ## precedence and this value is ignored. + ## Not eligible for live reload. + RedisPassword: "" + + ## RedisPrefix is a string used as a prefix for the keys in redis while storing + ## the peer membership. It might be useful to set this in any situation where + ## multiple trace-proxy clusters or multiple applications want to share a single + ## Redis instance. It may not be blank. + RedisPrefix: "tracing-proxy" + + ## RedisDatabase is an integer from 0-15 indicating the database number to use + ## for the Redis instance storing the peer membership. It might be useful to set + ## this in any situation where multiple trace-proxy clusters or multiple + ## applications want to share a single Redis instance. + RedisDatabase: 0 + + ## UseTLS enables TLS when connecting to redis for peer cluster membership management, and sets the MinVersion to 1.2. + ## Not eligible for live reload. + UseTLS: false + + ## UseTLSInsecure disables certificate checks + ## Not eligible for live reload. + UseTLSInsecure: true + + ## IdentifierInterfaceName is optional. + ## Due to the nature of DNS in Kubernetes, it is recommended to set this value to the 'eth0' interface name. + ## When configured the pod's IP will be used in the peer list + IdentifierInterfaceName: eth0 + + ## UseIPV6Identifier is optional. If using IdentifierInterfaceName, Trace Proxy will default to the first + ## IPv4 unicast address it finds for the specified interface. If UseIPV6Identifier is used, will use + ## the first IPV6 unicast address found. + UseIPV6Identifier: false + ########################################################### + +# LogrusLogger is a section of the config only used if you are using the +# LogrusLogger to send all logs to STDOUT using the logrus package. 
+LogrusLogger: + # LogFormatter specifies the log format. Accepted values are one of ["logfmt", "json"] + LogFormatter: 'json' + # LogOutput specifies where the logs are supposed to be written. Accepts one of ["stdout", "stderr"] + LogOutput: 'stdout' + +MetricsConfig: + # Enable specifies whether the metrics are supposed to be collected and exported to OpsRamp + Enable: true + + # ListenAddr determines the interface and port on which Prometheus will + # listen for requests for /metrics. Must be different from the main Trace Proxy + # listener. + ListenAddr: '0.0.0.0:2112' + + # OpsRampAPI is the URL for the upstream OpsRamp API. + OpsRampAPI: "https://portal.opsramp.net" + + # ReportingInterval is the frequency specified in seconds at which + # the metrics are collected and sent to OpsRamp + ReportingInterval: 10 + + # MetricsList is a list of regular expressions which match the metric + # names. Keep the list as small as possible since too many regular expressions can lead to bad performance. + # Internally, all the items in the list are concatenated using '|' to make the computation faster. + MetricsList: [ ".*" ] + +GRPCServerParameters: +# MaxConnectionIdle is a duration for the amount of time after which an +# idle connection would be closed by sending a GoAway. Idleness duration is +# defined since the most recent time the number of outstanding RPCs became +# zero or the connection establishment. +# 0s sets duration to infinity which is the default: +# https://github.com/grpc/grpc-go/blob/60a3a7e969c401ca16dbcd0108ad544fb35aa61c/internal/transport/http2_server.go#L217-L219 +# MaxConnectionIdle: "1m" + +# MaxConnectionAge is a duration for the maximum amount of time a +# connection may exist before it will be closed by sending a GoAway. A +# random jitter of +/-10% will be added to MaxConnectionAge to spread out +# connection storms. 
+# 0s sets duration to infinity which is the default: +# https://github.com/grpc/grpc-go/blob/60a3a7e969c401ca16dbcd0108ad544fb35aa61c/internal/transport/http2_server.go#L220-L222 +# MaxConnectionAge: "0s" + +# MaxConnectionAgeGrace is an additive period after MaxConnectionAge after +# which the connection will be forcibly closed. +# 0s sets duration to infinity which is the default: +# https://github.com/grpc/grpc-go/blob/60a3a7e969c401ca16dbcd0108ad544fb35aa61c/internal/transport/http2_server.go#L225-L227 +# MaxConnectionAgeGrace: "0s" + +# After a duration of this time if the server doesn't see any activity it +# pings the client to see if the transport is still alive. +# If set below 1s, a minimum value of 1s will be used instead. +# 0s sets duration to 2 hours which is the default: +# https://github.com/grpc/grpc-go/blob/60a3a7e969c401ca16dbcd0108ad544fb35aa61c/internal/transport/http2_server.go#L228-L230 +# Time: "10s" + +# After having pinged for keepalive check, the server waits for a duration +# of Timeout and if no activity is seen even after that the connection is +# closed. +# 0s sets duration to 20 seconds which is the default: +# https://github.com/grpc/grpc-go/blob/60a3a7e969c401ca16dbcd0108ad544fb35aa61c/internal/transport/http2_server.go#L231-L233 +# Timeout: "2s" + +################################ +## Sample Cache Configuration ## +################################ + +# Sample Cache Configuration controls the sample cache used to retain information about trace +# status after the sampling decision has been made. +SampleCacheConfig: +# Type controls the type of sample cache used. +# "legacy" is a strategy where both keep and drop decisions are stored in a circular buffer that is +# 5x the size of the trace cache. This is tracing proxy's original sample cache strategy. 
+# "cuckoo" is a strategy where dropped traces are preserved in a "Cuckoo Filter", which can remember +# a much larger number of dropped traces, leaving capacity to retain a much larger number of kept traces. +# It is also more configurable. The cuckoo filter is recommended for most installations. +# Default is "legacy". +# Type: "cuckoo" + +# KeptSize controls the number of traces preserved in the cuckoo kept traces cache. +# tracing proxy keeps a record of each trace that was kept and sent to OpsRamp, along with some +# statistical information. This is most useful in cases where the trace was sent before sending +# the root span, so that the root span can be decorated with accurate metadata. +# Default is 10_000 traces (each trace in this cache consumes roughly 200 bytes). +# It Does not apply to the "legacy" type of cache. +# KeptSize: 10_000 + +# DroppedSize controls the size of the cuckoo dropped traces cache. +# This cache consumes 4-6 bytes per trace at a scale of millions of traces. +# Changing its size with live reload sets a future limit, but does not have an immediate effect. +# Default is 1_000_000 traces. +# It Does not apply to the "legacy" type of cache. +# DroppedSize: 1_000_000 + +# SizeCheckInterval controls the duration of how often the cuckoo cache re-evaluates +# the remaining capacity of its dropped traces cache and possibly cycles it. +# This cache is quite resilient so it doesn't need to happen very often, but the +# operation is also inexpensive. +# Default is 10 seconds. +# It Does not apply to the "legacy" type of cache. 
+# SizeCheckInterval: "10s" \ No newline at end of file diff --git a/rules_unit_tests.yaml b/rules_unit_tests.yaml new file mode 100644 index 0000000000..99cdc4ae03 --- /dev/null +++ b/rules_unit_tests.yaml @@ -0,0 +1,258 @@ +############################ +## Sampling Rules Config ## +############################ + +# DryRun - If enabled, marks traces that would be dropped given current sampling rules, +# and sends all traces regardless +DryRun: false + +# DryRunFieldName - the key to add to use to add to event data when using DryRun mode above, defaults to trace_proxy_kept +DryRunFieldName: trace_proxy_kept + +# DeterministicSampler is a section of the config for manipulating the +# Deterministic Sampler implementation. This is the simplest sampling algorithm +# - it is a static sample rate, choosing traces randomly to either keep or send +# (at the appropriate rate). It is not influenced by the contents of the trace. +Sampler: DeterministicSampler + +# SampleRate is the rate at which to sample. It indicates a ratio, where one +# sample trace is kept for every n traces seen. For example, a SampleRate of 30 +# will keep 1 out of every 30 traces. The choice on whether to keep any specific +# trace is random, so the rate is approximate. +# Eligible for live reload. +SampleRate: 1 + +dataset1: + + # Note: If your dataset name contains a space, you will have to escape the dataset name + # using single quotes, such as ['dataset 1'] + + # DynamicSampler is a section of the config for manipulating the simple Dynamic Sampler + # implementation. This sampler collects the values of a number of fields from a + # trace and uses them to form a key. This key is handed to the standard dynamic + # sampler algorithm which generates a sample rate based on the frequency with + # which that key has appeared in the previous ClearFrequencySec seconds. + Sampler: DynamicSampler + + # SampleRate is the goal rate at which to sample. 
It indicates a ratio, where + # one sample trace is kept for every n traces seen. For example, a SampleRate of + # 30 will keep 1 out of every 30 traces. This rate is handed to the dynamic + # sampler, who assigns a sample rate for each trace based on the fields selected + # from that trace. + SampleRate: 2 + + # FieldList is a list of all the field names to use to form the key that will be handed to the dynamic sampler. + # The combination of values from all of these fields should reflect how interesting the trace is compared to + # another. A good field selection has consistent values for high-frequency, boring traffic, and unique values for + # outliers and interesting traffic. Including an error field (or something like HTTP status code) is an excellent + # choice. Using fields with very high cardinality (like `k8s.pod.id`), is a bad choice. If the combination of + # fields essentially makes them unique, the dynamic sampler will sample everything. If the combination of fields is + # not unique enough, you will not be guaranteed samples of the most interesting traces. As an example, consider a + # combination of HTTP endpoint (high-frequency and boring), HTTP method, and status code (normally boring but can + # become interesting when indicating an error) as a good set of fields since it will allow proper sampling + # of all endpoints under normal traffic and call out when there is failing traffic to any endpoint. + # For example, in contrast, consider a combination of HTTP endpoint, status code, and pod id as a bad set of + # fields, since it would result in keys that are all unique, and therefore results in sampling 100% of traces. + # Using only the HTTP endpoint field would be a **bad** choice, as it is not unique enough and therefore + # interesting traces, like traces that experienced a `500`, might not be sampled. + # Field names may come from any span in the trace. 
+ FieldList: + - request.method + - http.target + - response.status_code + + # UseTraceLength will add the number of spans in the trace in to the dynamic + # sampler as part of the key. The number of spans is exact, so if there are + # normally small variations in trace length you may want to leave this off. If + # traces are consistent lengths and changes in trace length is a useful + # indicator of traces you'd like to see in OpsRamp, set this to true. + UseTraceLength: true + + # AddSampleRateKeyToTrace when this is set to true, the sampler will add a field + # to the root span of the trace containing the key used by the sampler to decide + # the sample rate. This can be helpful in understanding why the sampler is + # making certain decisions about sample rate and help you understand how to + # better choose the sample rate key (aka the FieldList setting above) to use. + AddSampleRateKeyToTrace: true + + # AddSampleRateKeyToTraceField is the name of the field the sampler will use + # when adding the sample rate key to the trace. This setting is only used when + # AddSampleRateKeyToTrace is true. + AddSampleRateKeyToTraceField: meta.tracing-proxy.dynsampler_key + + # ClearFrequencySec is the name of the field the sampler will use to determine + # the period over which it will calculate the sample rate. This setting defaults + # to 30. + ClearFrequencySec: 60 +dataset2: + + # EMADynamicSampler is a section of the config for manipulating the Exponential + # Moving Average (EMA) Dynamic Sampler implementation. Like the simple DynamicSampler, + # it attempts to average a given sample rate, weighting rare traffic and frequent + # traffic differently so as to end up with the correct average. + # + # EMADynamicSampler is an improvement upon the simple DynamicSampler and is recommended + # for most use cases. 
Based on the DynamicSampler implementation, EMADynamicSampler differs + # in that rather than compute rate based on a periodic sample of traffic, it maintains an Exponential + # Moving Average of counts seen per key, and adjusts this average at regular intervals. + # The weight applied to more recent intervals is defined by `weight`, a number between + # (0, 1) - larger values weight the average more toward recent observations. In other words, + # a larger weight will cause sample rates to more quickly adapt to traffic patterns, + # while a smaller weight will result in sample rates that are less sensitive to bursts or drops + # in traffic and thus more consistent over time. + # + # Keys that are not found in the EMA will always have a sample + # rate of 1. Keys that occur more frequently will be sampled on a logarithmic + # curve. In other words, every key will be represented at least once in any + # given window and more frequent keys will have their sample rate + # increased proportionally to wind up with the goal sample rate. + Sampler: EMADynamicSampler + + # GoalSampleRate is the goal rate at which to sample. It indicates a ratio, where + # one sample trace is kept for every n traces seen. For example, a SampleRate of + # 30 will keep 1 out of every 30 traces. This rate is handed to the dynamic + # sampler, who assigns a sample rate for each trace based on the fields selected + # from that trace. + GoalSampleRate: 2 + + # FieldList is a list of all the field names to use to form the key that will be handed to the dynamic sampler. + # The combination of values from all of these fields should reflect how interesting the trace is compared to + # another. A good field selection has consistent values for high-frequency, boring traffic, and unique values for + # outliers and interesting traffic. Including an error field (or something like HTTP status code) is an excellent + # choice. Using fields with very high cardinality (like `k8s.pod.id`), is a bad choice. 
If the combination of + # fields essentially makes them unique, the dynamic sampler will sample everything. If the combination of fields is + # not unique enough, you will not be guaranteed samples of the most interesting traces. As an example, consider a + # combination of HTTP endpoint (high-frequency and boring), HTTP method, and status code (normally boring but can + # become interesting when indicating an error) as a good set of fields since it will allow proper sampling + # of all endpoints under normal traffic and call out when there is failing traffic to any endpoint. + # For example, in contrast, consider a combination of HTTP endpoint, status code, and pod id as a bad set of + # fields, since it would result in keys that are all unique, and therefore results in sampling 100% of traces. + # Using only the HTTP endpoint field would be a **bad** choice, as it is not unique enough and therefore + # interesting traces, like traces that experienced a `500`, might not be sampled. + # Field names may come from any span in the trace. + FieldList: + - request.method + - http.target + - response.status_code + + # UseTraceLength will add the number of spans in the trace in to the dynamic + # sampler as part of the key. The number of spans is exact, so if there are + # normally small variations in trace length you may want to leave this off. If + # traces are consistent lengths and changes in trace length is a useful + # indicator of traces you'd like to see in OpsRamp, set this to true. + UseTraceLength: true + + # AddSampleRateKeyToTrace when this is set to true, the sampler will add a field + # to the root span of the trace containing the key used by the sampler to decide + # the sample rate. This can be helpful in understanding why the sampler is + # making certain decisions about sample rate and help you understand how to + # better choose the sample rate key (aka the FieldList setting above) to use. 
+ AddSampleRateKeyToTrace: true + + # AddSampleRateKeyToTraceField is the name of the field the sampler will use + # when adding the sample rate key to the trace. This setting is only used when + # AddSampleRateKeyToTrace is true. + AddSampleRateKeyToTraceField: meta.tracing-proxy.dynsampler_key + + # AdjustmentInterval defines how often (in seconds) we adjust the moving average from + # recent observations. Default 15s + AdjustmentInterval: 15 + + # Weight is a value between (0, 1) indicating the weighting factor used to adjust + # the EMA. With larger values, newer data will influence the average more, and older + # values will be factored out more quickly. In mathematical literature concerning EMA, + # this is referred to as the `alpha` constant. + # Default is 0.5 + Weight: 0.5 + + # MaxKeys, if greater than 0, limits the number of distinct keys tracked in EMA. + # Once MaxKeys is reached, new keys will not be included in the sample rate map, but + # existing keys will continue to be counted. You can use this to keep the sample rate + # map size under control. + MaxKeys: 0 + + # AgeOutValue indicates the threshold for removing keys from the EMA. The EMA of any key + # will approach 0 if it is not repeatedly observed, but will never truly reach it, so we have to + # decide what constitutes "zero". Keys with averages below this threshold will be removed + # from the EMA. Default is the same as Weight, as this prevents a key with the smallest + # integer value (1) from being aged out immediately. This value should generally be <= Weight, + # unless you have very specific reasons to set it higher. + AgeOutValue: 0.5 + + # BurstMultiple, if set, is multiplied by the sum of the running average of counts to define + # the burst detection threshold. If total counts observed for a given interval exceed the threshold + # EMA is updated immediately, rather than waiting on the AdjustmentInterval. + # Defaults to 2; negative value disables. 
With a default of 2, if your traffic suddenly doubles, + # burst detection will kick in. + BurstMultiple: 2 + + # BurstDetectionDelay indicates the number of intervals to run after Start is called before + # burst detection kicks in. + # Defaults to 3 + BurstDetectionDelay: 3 +dataset3: + Sampler: DeterministicSampler + SampleRate: 10 +dataset4: + Sampler: RulesBasedSampler + CheckNestedFields: false + rule: + - name: drop healthchecks + drop: true + condition: + - field: http.route + operator: '=' + value: /health-check + - name: keep slow 500 errors + SampleRate: 1 + condition: + - field: status_code + operator: '=' + value: 500 + - field: duration_ms + operator: '>=' + value: 1000.789 + - name: dynamically sample 200 responses + condition: + - field: status_code + operator: '=' + value: 200 + sampler: + EMADynamicSampler: + Sampler: EMADynamicSampler + GoalSampleRate: 15 + FieldList: + - request.method + - request.route + AddSampleRateKeyToTrace: true + AddSampleRateKeyToTraceField: meta.tracing-proxy.dynsampler_key + - name: dynamically sample 200 string responses + condition: + - field: status_code + operator: '=' + value: '200' + datatype: int + sampler: + EMADynamicSampler: + Sampler: EMADynamicSampler + GoalSampleRate: 15 + FieldList: + - request.method + - request.route + AddSampleRateKeyToTrace: true + AddSampleRateKeyToTraceField: meta.tracing-proxy.dynsampler_key + - name: sample traces originating from a service + Scope: span + SampleRate: 5 + condition: + - field: service name + operator: '=' + value: users + - field: meta.span_type + operator: '=' + value: root + - SampleRate: 10 +dataset5: + Sampler: TotalThroughputSampler + GoalThroughputPerSec: 100 + FieldList: '[request.method]'