# config_unit_tests.yaml
########################
## Trace Proxy Config ##
########################
# ListenAddr is the IP and port on which to listen for incoming events. Incoming
# traffic is expected to be HTTP, so if using SSL put something like nginx in
# front to do the TLS Termination.
ListenAddr: 0.0.0.0:8082
# GRPCListenAddr is the IP and port on which to listen for incoming events over
# gRPC. Incoming traffic is expected to be unencrypted, so if using SSL put something like nginx in
# front to do the TLS Termination.
GRPCListenAddr: 0.0.0.0:9090
# PeerListenAddr is the IP and port on which to listen for traffic being
# rerouted from a peer. Peer traffic is expected to be HTTP, so if using SSL
# put something like nginx in front to do the decryption. Must be different from
# ListenAddr
PeerListenAddr: 0.0.0.0:8083
GRPCPeerListenAddr: 0.0.0.0:8084
# CompressPeerCommunication determines whether to compress the span data
# that is forwarded to peers. If it costs money to transmit data between different
# instances (e.g. they're spread across AWS availability zones), then you
# almost certainly want compression enabled to reduce your bill. The option to
# disable it is provided as an escape hatch for deployments that value lower CPU
# utilization over data transfer costs.
CompressPeerCommunication: true
# OpsrampAPI is the URL for the upstream OpsRamp API.
OpsrampAPI: "https://portal.opsramp.net"
# Dataset you want to use for sampling
Dataset: "ds"
# TLS Options
UseTls: true
UseTlsInsecure: false
# LoggingLevel valid options are "debug", "info", "error", and "panic".
LoggingLevel: error
# SendDelay is a short timer that will be triggered when a trace is complete.
# Trace Proxy will wait for this duration before actually sending the trace. The
# reason for this short delay is to allow for small network delays or clock
# jitters to elapse and any final spans to arrive before actually sending the
# trace. This supports duration strings with supplied units. Set to 0 for
# immediate sends.
SendDelay: 2s
# BatchTimeout dictates how frequently to send unfulfilled batches. By default
# this will use the DefaultBatchTimeout in libtrace as its value, which is 100ms.
# Eligible for live reload.
BatchTimeout: 1s
# TraceTimeout is a long timer; it represents the outside boundary of how long
# to wait before sending an incomplete trace. Normally traces are sent when the
# root span arrives. Sometimes the root span never arrives (due to crashes or
# whatever), and this timer will send a trace even without having received the
# root span. If you have particularly long-lived traces you should increase this
# timer. This supports duration strings with supplied units.
TraceTimeout: 60s
# MaxBatchSize is the number of events to be included in the batch for sending
MaxBatchSize: 500
# SendTicker is a short timer; it determines how often to check for traces that are ready to send
SendTicker: 100ms
# UpstreamBufferSize and PeerBufferSize control how large of an event queue to use
# when buffering events that will be forwarded to peers or the upstream API.
UpstreamBufferSize: 1000
PeerBufferSize: 1000
# AddHostMetadataToTrace determines whether to add information about
# the host that tracing proxy is running on to the spans that it processes.
# If enabled, information about the host will be added to each span with the
# key 'meta.local_hostname'.
AddHostMetadataToTrace: false
# AddAdditionalMetadata adds the specified key-value pairs to all traces and metrics.
# The value must be a valid JSON object of key-value pairs, e.g. {"key_1":"value_1", "key_2":"value_2"}.
# A maximum of 5 additional keys is supported; if more are supplied, only the first 5
# (in sorted key order) are used.
# The "app" label is mandatory.
AddAdditionalMetadata: { "app": "default" }
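# For illustration only, a commented-out example with several labels; the label names and
# values below are placeholders, not shipped defaults. Note that "app" is still required and
# at most 5 keys are honored:
# AddAdditionalMetadata: { "app": "payments", "env": "staging", "team": "platform" }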
# EnvironmentCacheTTL is the amount of time a cache entry will live that associates
# an API key with an environment name.
# Cache misses lookup the environment name using OpsRampAPI config value.
# Default is 1 hour ("1h").
EnvironmentCacheTTL: "1h"
# QueryAuthToken, if specified, provides a token that must be specified with
# the header "X-OpsRamp-Tracing-Proxy-Query" in order for a /query request to succeed.
# These /query requests are intended for debugging OpsRamp-Tracing-Proxy installations and
# are not typically needed in normal operation.
# Can be specified in the environment as TRACING_PROXY_QUERY_AUTH_TOKEN.
# If left unspecified, the /query endpoints are inaccessible.
# QueryAuthToken: "some-random-value"
# AddRuleReasonToTrace causes traces that are sent to OpsRamp to include a field which
# contains text indicating which rule was evaluated that caused the trace to be included.
AddRuleReasonToTrace: true
# AdditionalErrorFields should be a list of span fields that should be included when logging
# errors that happen during ingestion of events (for example, the span too large error).
# This is primarily useful in trying to track down misbehaving senders in a large installation.
# The fields `dataset`, `apihost`, and `environment` are always included.
# If a field is not present in the span, it will not be present in the error log.
# Default is ["trace.span_id"].
AdditionalErrorFields:
- trace.span_id
# AddSpanCountToRoot adds a new metadata field, `meta.span_count` to root spans to indicate
# the number of child spans on the trace at the time the sampling decision was made.
# This value is available to the rules-based sampler, making it possible to write rules that
# are dependent upon the number of spans in the trace.
# Default is false.
AddSpanCountToRoot: false
# CacheOverrunStrategy controls the cache management behavior under memory pressure.
# "resize" means that when a cache overrun occurs, the cache is shrunk and never grows again,
# which is generally not helpful unless it occurs because of a permanent change in traffic patterns.
# In the "impact" strategy, the items having the most impact on the cache size are
# ejected from the cache earlier than normal but the cache is not resized.
# In all cases, it only applies if MaxAlloc is nonzero.
# Default is "resize" for compatibility but "impact" is recommended for most installations.
CacheOverrunStrategy: "impact"
#########################
## Retry Configuration ##
#########################
RetryConfiguration:
# InitialInterval is the time to wait after the first failure before retrying.
InitialInterval: 500ms
# RandomizationFactor is the factor used to randomize the next backoff interval:
# Randomized interval = RetryInterval * (1 ± RandomizationFactor)
RandomizationFactor: 0.5
# Multiplier is the factor by which the backoff interval is multiplied after each retry
Multiplier: 1.5
# MaxInterval is the upper bound on backoff interval. Once this value is reached, the delay between
# consecutive retries will always be `MaxInterval`.
MaxInterval: 60s
# MaxElapsedTime is the maximum amount of time (including retries) spent trying to send a request.
# Once this value is reached, the data is discarded.
MaxElapsedTime: 15m
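# As a rough illustration of how the settings above combine (assuming standard exponential
# backoff; exact timings depend on the backoff implementation used):
# attempt 1: base 500ms,  jittered to 250ms - 750ms      (500ms * (1 ± 0.5))
# attempt 2: base 750ms,  jittered to 375ms - 1.125s     (500ms * 1.5)
# attempt 3: base 1.125s, jittered to 562ms - 1.688s     (750ms * 1.5)
# ... the base interval is capped at MaxInterval (60s), and once MaxElapsedTime (15m)
# has passed the request is no longer retried and the data is discarded.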
#########################
## Proxy Configuration ##
#########################
ProxyConfiguration:
# Protocol accepts http and https
Protocol: "http"
# Host takes the proxy server address
Host: ""
# Port takes the proxy server port
Port: 3128
# UserName takes the proxy username
Username: ""
# Password takes the proxy password
Password: ""
##################################
## Authentication Configuration ##
##################################
AuthConfiguration:
# Endpoint - the API server address provided in the OpsRamp Portal to which the auth token request is made
Endpoint: "https://portal.opsramp.net"
# Key - authentication key provided in OpsRamp Portal
Key: "super-secret-key"
# Secret - authentication Secret provided in OpsRamp Portal
Secret: "super-secret-secret"
# TenantId - tenant/client id to which the traces are to be posted
TenantId: "super-secret-tenantId"
############################
## Implementation Choices ##
############################
# Each of the config options below chooses an implementation of a Trace Proxy
# component to use. Depending on the choice, there may be more configuration
# required below in the section for that choice. Changing implementation choices
# requires a process restart.
# Collector describes which collector to use for collecting traces. The only
# current valid option is "InMemCollector". More can be added by adding
# implementations of the Collector interface.
Collector: "InMemCollector"
# InMemCollector brings together all the settings that are relevant to
# collecting spans together to make traces.
InMemCollector:
# The collection cache is used to collect all spans into a trace as well as
# remember the sampling decision for any spans that might come in after the
# trace has been marked "complete" (either by timing out or seeing the root
# span). The number of traces in the cache should be many multiples (100x to
# 1000x) of the total number of concurrently active traces (trace throughput *
# trace duration).
CacheCapacity: 1000
# MaxAlloc is optional. If set, it must be an integer >= 0.
# If set to a non-zero value, once per tick (see SendTicker) the collector
# will compare total allocated bytes to this value. If allocation is too
# high, cache capacity will be reduced and an error will be logged.
# Useful values for this setting are generally in the range of 75%-90% of
# available system memory; using 80% is recommended.
# This value should be set in accordance with resources.limits.memory.
# By default that limit is 4GB, and 80% of it works out to:
# 4 * 1024 * 1024 * 1024 * 0.80 = 3,435,973,836
# MaxAlloc: 3435973836
MaxAlloc: 0
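# As a purely hypothetical example, if the container memory limit were raised to 8GB,
# the same 80% rule would give:
# 8 * 1024 * 1024 * 1024 * 0.80 = 6,871,947,673
# MaxAlloc: 6871947673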
#####################
## Peer Management ##
#####################
# Configure how OpsRamp-Tracing-Proxy peers are discovered and managed
PeerManagement:
# Strategy controls the way that traces are assigned to Trace Proxy nodes.
# The "legacy" strategy uses a simple algorithm that unfortunately causes
# 1/2 of the in-flight traces to be assigned to a different node whenever the
# number of nodes changes.
# The legacy strategy is deprecated and is intended to be removed in a future release.
# The "hash" strategy is strongly recommended, as only 1/N traces (where N is the
# number of nodes) are disrupted when the node count changes.
# Not eligible for live reload.
Strategy: "hash"
###########################################################
###### File (Suitable only for VM based deployments ######
###### and single replica k8s deployments) ######
###########################################################
#Type: "file"
# Peers is the list of all servers participating in this proxy cluster. Events
# will be sharded evenly across all peers based on the Trace ID. Values here
# should be the base URL used to access the peer, and should include scheme,
# hostname (or ip address) and port. All servers in the cluster should be in
# this list, including this host.
#Peers: [
# "http://127.0.0.1:8084", #only grpc peer listener used
#]
###########################################################
###########################################################
###### Redis (Suitable for all types of deployments) ######
###########################################################
## The type should always be redis when deployed to Kubernetes environments
Type: "redis"
## RedisHost is used to connect to redis for peer cluster membership management.
## Further, if the environment variable 'TRACING_PROXY_REDIS_HOST' is set it takes
## precedence and this value is ignored.
## Not eligible for live reload.
## RedisHost will default to the name used for the release or name overrides, depending on which is used,
## but can be overridden to a specific value.
RedisHost: "127.0.0.1"
## RedisUsername is the username used to connect to redis for peer cluster membership management.
## If the environment variable 'TRACING_PROXY_REDIS_USERNAME' is set it takes
## precedence and this value is ignored.
## Not eligible for live reload.
RedisUsername: ""
## RedisPassword is the password used to connect to redis for peer cluster membership management.
## If the environment variable 'TRACING_PROXY_REDIS_PASSWORD' is set it takes
## precedence and this value is ignored.
## Not eligible for live reload.
RedisPassword: ""
## RedisPrefix is a string used as a prefix for the keys in redis while storing
## the peer membership. It might be useful to set this in any situation where
## multiple trace-proxy clusters or multiple applications want to share a single
## Redis instance. It may not be blank.
RedisPrefix: "tracing-proxy"
## RedisDatabase is an integer from 0-15 indicating the database number to use
## for the Redis instance storing the peer membership. It might be useful to set
## this in any situation where multiple trace-proxy clusters or multiple
## applications want to share a single Redis instance.
RedisDatabase: 0
## UseTLS enables TLS when connecting to redis for peer cluster membership management, and sets the MinVersion to 1.2.
## Not eligible for live reload.
UseTLS: false
## UseTLSInsecure disables certificate checks
## Not eligible for live reload.
UseTLSInsecure: true
## IdentifierInterfaceName is optional.
## Due to the nature of DNS in Kubernetes, it is recommended to set this value to the 'eth0' interface name.
## When configured, the pod's IP will be used in the peer list.
IdentifierInterfaceName: eth0
## UseIPV6Identifier is optional. If using IdentifierInterfaceName, Trace Proxy will default to the first
## IPv4 unicast address it finds for the specified interface. If UseIPV6Identifier is true, the first
## IPv6 unicast address found will be used instead.
UseIPV6Identifier: false
###########################################################
# LogrusLogger is a section of the config only used if you are using the
# LogrusLogger to send all logs to STDOUT using the logrus package.
LogrusLogger:
# LogFormatter specifies the log format. Accepted values are one of ["logfmt", "json"]
LogFormatter: 'json'
# LogOutput specifies where the logs are supposed to be written. Accepts one of ["stdout", "stderr"]
LogOutput: 'stdout'
MetricsConfig:
# Enable specifies whether the metrics are supposed to be collected and exported to OpsRamp
Enable: true
# ListenAddr determines the interface and port on which Prometheus will
# listen for requests for /metrics. Must be different from the main Trace Proxy
# listener.
ListenAddr: '0.0.0.0:2112'
# OpsRampAPI is the URL for the upstream OpsRamp API.
OpsRampAPI: "https://portal.opsramp.net"
# ReportingInterval is the interval, in seconds, at which
# metrics are collected and sent to OpsRamp
ReportingInterval: 10
# MetricsList is a list of regular expressions which match the metric
# names. Keep the list as small as possible since too many regular expressions can lead to bad performance.
# Internally, all the items in the list are concatenated using '|' to make the computation faster.
MetricsList: [ ".*" ]
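# For illustration only (the patterns and metric names matched below are hypothetical, not a
# list of metrics the proxy is known to emit):
# MetricsList: [ "^trace_proxy_.*", ".*_errors_total$" ]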
GRPCServerParameters:
# MaxConnectionIdle is the amount of time after which an idle connection
# is closed by sending a GoAway. Idleness is measured from the most recent
# time the number of outstanding RPCs became zero, or from connection
# establishment.
# 0s sets duration to infinity which is the default:
# https://github.com/grpc/grpc-go/blob/60a3a7e969c401ca16dbcd0108ad544fb35aa61c/internal/transport/http2_server.go#L217-L219
# MaxConnectionIdle: "1m"
# MaxConnectionAge is a duration for the maximum amount of time a
# connection may exist before it will be closed by sending a GoAway. A
# random jitter of +/-10% will be added to MaxConnectionAge to spread out
# connection storms.
# 0s sets duration to infinity which is the default:
# https://github.com/grpc/grpc-go/blob/60a3a7e969c401ca16dbcd0108ad544fb35aa61c/internal/transport/http2_server.go#L220-L222
# MaxConnectionAge: "0s"
# MaxConnectionAgeGrace is an additive period after MaxConnectionAge after
# which the connection will be forcibly closed.
# 0s sets duration to infinity which is the default:
# https://github.com/grpc/grpc-go/blob/60a3a7e969c401ca16dbcd0108ad544fb35aa61c/internal/transport/http2_server.go#L225-L227
# MaxConnectionAgeGrace: "0s"
# Time is the duration after which, if the server sees no activity, it
# pings the client to check whether the transport is still alive.
# If set below 1s, a minimum value of 1s will be used instead.
# 0s sets duration to 2 hours which is the default:
# https://github.com/grpc/grpc-go/blob/60a3a7e969c401ca16dbcd0108ad544fb35aa61c/internal/transport/http2_server.go#L228-L230
# Time: "10s"
# After sending a keepalive ping, the server waits for a duration of
# Timeout; if no activity is seen even after that, the connection is
# closed.
# 0s sets duration to 20 seconds which is the default:
# https://github.com/grpc/grpc-go/blob/60a3a7e969c401ca16dbcd0108ad544fb35aa61c/internal/transport/http2_server.go#L231-L233
# Timeout: "2s"
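# Taken together, a fully specified block would look like the following; the values simply
# repeat the commented examples above and are not independent recommendations:
# MaxConnectionIdle: "1m"
# MaxConnectionAge: "0s"
# MaxConnectionAgeGrace: "0s"
# Time: "10s"
# Timeout: "2s"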
################################
## Sample Cache Configuration ##
################################
# Sample Cache Configuration controls the sample cache used to retain information about trace
# status after the sampling decision has been made.
SampleCacheConfig:
# Type controls the type of sample cache used.
# "legacy" is a strategy where both keep and drop decisions are stored in a circular buffer that is
# 5x the size of the trace cache. This is tracing proxy's original sample cache strategy.
# "cuckoo" is a strategy where dropped traces are preserved in a "Cuckoo Filter", which can remember
# a much larger number of dropped traces, leaving capacity to retain a much larger number of kept traces.
# It is also more configurable. The cuckoo filter is recommended for most installations.
# Default is "legacy".
# Type: "cuckoo"
# KeptSize controls the number of traces preserved in the cuckoo kept traces cache.
# tracing proxy keeps a record of each trace that was kept and sent to OpsRamp, along with some
# statistical information. This is most useful in cases where the trace was sent before sending
# the root span, so that the root span can be decorated with accurate metadata.
# Default is 10_000 traces (each trace in this cache consumes roughly 200 bytes).
# It does not apply to the "legacy" type of cache.
# KeptSize: 10_000
# DroppedSize controls the size of the cuckoo dropped traces cache.
# This cache consumes 4-6 bytes per trace at a scale of millions of traces.
# Changing its size with live reload sets a future limit, but does not have an immediate effect.
# Default is 1_000_000 traces.
# It does not apply to the "legacy" type of cache.
# DroppedSize: 1_000_000
# SizeCheckInterval controls how often the cuckoo cache re-evaluates
# the remaining capacity of its dropped traces cache and possibly cycles it.
# This check doesn't need to happen very often because the cache is quite resilient, but the
# operation is also inexpensive.
# Default is 10 seconds.
# It does not apply to the "legacy" type of cache.
# SizeCheckInterval: "10s"
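# Putting the options above together, a cuckoo-based configuration might look like this
# (the values simply repeat the documented defaults and examples above):
# Type: "cuckoo"
# KeptSize: 10_000
# DroppedSize: 1_000_000
# SizeCheckInterval: "10s"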