From efebdab3a7c82c847f94317ad83d21f8943b752e Mon Sep 17 00:00:00 2001 From: weeniearms Date: Mon, 4 Nov 2024 20:54:17 +0100 Subject: [PATCH] feat: allow disabling network and/or storage metric collection only Since not all metrics are likely to be of the same relevance, it would be beneficial from a cost perspective to allow disabling some of them, namely Network IO and Block IO. This change introduces the ability to disable network and storage stats collection via ECS_DISABLE_NETWORK_METRICS and ECS_DISABLE_STORAGE_METRICS respectively. --- README.md | 2 + agent/config/config.go | 6 ++ agent/config/config_test.go | 4 ++ agent/config/config_unix.go | 2 + agent/config/config_unix_test.go | 2 + agent/config/config_windows_test.go | 2 + agent/config/types.go | 8 +++ agent/stats/engine.go | 102 +++++++++++++++------------- agent/stats/engine_test.go | 31 ++++++--- agent/stats/engine_unix_test.go | 18 ++--- 10 files changed, 111 insertions(+), 66 deletions(-) diff --git a/README.md b/README.md index 6ae3cec3075..827fe9e1aae 100644 --- a/README.md +++ b/README.md @@ -186,6 +186,8 @@ additional details on each available environment variable. | `ECS_DATADIR` | /data/ | The container path where state is checkpointed for use across agent restarts. Note that on Linux, when you specify this, you will need to make sure that the Agent container has a bind mount of `$ECS_HOST_DATA_DIR/data:$ECS_DATADIR` with the corresponding values of `ECS_HOST_DATA_DIR` and `ECS_DATADIR`. | /data/ | `C:\ProgramData\Amazon\ECS\data` | `ECS_UPDATES_ENABLED` | <true | false> | Whether to exit for an updater to apply updates when requested. | false | false | | `ECS_DISABLE_METRICS` | <true | false> | Whether to disable metrics gathering for tasks. | false | false | +| `ECS_DISABLE_STORAGE_METRICS` | <true | false> | Whether to disable storage metrics gathering for tasks. | false | false | +| `ECS_DISABLE_NETWORK_METRICS` | <true | false> | Whether to disable network metrics gathering for tasks. | false | false | | `ECS_POLL_METRICS` | <true | false> | Whether to poll or stream when gathering metrics for tasks. Setting this value to `true` can help reduce the CPU usage of dockerd and containerd on the ECS container instance. See also ECS_POLL_METRICS_WAIT_DURATION for setting the poll interval. | `false` | `false` | | `ECS_POLLING_METRICS_WAIT_DURATION` | 10s | Time to wait between polling for metrics for a task. Not used when ECS_POLL_METRICS is false. Maximum value is 20s and minimum value is 5s. If user sets above maximum it will be set to max, and if below minimum it will be set to min. As the number of tasks/containers increase, a higher `ECS_POLLING_METRICS_WAIT_DURATION` value can potentially cause a problem where memory reservation value of ECS cluster reported in metrics becomes unstable due to missing metrics sample at metric collection time. It is recommended to keep this value smaller than 18s. This behavior is only observed on certain OS and platforms. | 10s | 10s | | `ECS_PULL_DEPENDENT_CONTAINERS_UPFRONT` | <true | false> | Whether to pull images for containers with dependencies before the dependsOn condition has been satisfied. | false | false | diff --git a/agent/config/config.go b/agent/config/config.go index ccbafa79370..15b51888a62 100644 --- a/agent/config/config.go +++ b/agent/config/config.go @@ -541,6 +541,8 @@ func environmentConfig() (Config, error) { UpdatesEnabled: parseBooleanDefaultFalseConfig("ECS_UPDATES_ENABLED"), UpdateDownloadDir: os.Getenv("ECS_UPDATE_DOWNLOAD_DIR"), DisableMetrics: parseBooleanDefaultFalseConfig("ECS_DISABLE_METRICS"), + DisableNetworkMetrics: parseBooleanDefaultFalseConfig("ECS_DISABLE_NETWORK_METRICS"), + DisableStorageMetrics: parseBooleanDefaultFalseConfig("ECS_DISABLE_STORAGE_METRICS"), ReservedMemory: parseEnvVariableUint16("ECS_RESERVED_MEMORY"), AvailableLoggingDrivers: parseAvailableLoggingDrivers(), PrivilegedDisabled: parseBooleanDefaultFalseConfig("ECS_DISABLE_PRIVILEGED"), @@ -626,6 +628,8 @@ func (cfg *Config) String() string { "AuthType: %v, "+ "UpdatesEnabled: %v, "+ "DisableMetrics: %v, "+ + "DisableNetworkMetrics: %v, "+ + "DisableStorageMetrics: %v, "+ "PollMetrics: %v, "+ "PollingMetricsWaitDuration: %v, "+ "ReservedMem: %v, "+ @@ -646,6 +650,8 @@ func (cfg *Config) String() string { cfg.EngineAuthType, cfg.UpdatesEnabled, cfg.DisableMetrics, + cfg.DisableNetworkMetrics, + cfg.DisableStorageMetrics, cfg.PollMetrics, cfg.PollingMetricsWaitDuration, cfg.ReservedMemory, diff --git a/agent/config/config_test.go b/agent/config/config_test.go index 4d0ee41ceea..ebe813c8d06 100644 --- a/agent/config/config_test.go +++ b/agent/config/config_test.go @@ -252,10 +252,14 @@ func TestConfigBoolean(t *testing.T) { defer setTestRegion()() defer setTestEnv("ECS_DISABLE_DOCKER_HEALTH_CHECK", "true")() defer setTestEnv("ECS_DISABLE_METRICS", "true")() + defer setTestEnv("ECS_DISABLE_NETWORK_METRICS", "true")() + defer setTestEnv("ECS_DISABLE_STORAGE_METRICS", "true")() defer setTestEnv("ECS_ENABLE_SPOT_INSTANCE_DRAINING", "true")() cfg, err := NewConfig(ec2.NewBlackholeEC2MetadataClient()) assert.NoError(t, err) assert.True(t, cfg.DisableMetrics.Enabled()) + assert.True(t, cfg.DisableNetworkMetrics.Enabled()) + assert.True(t, cfg.DisableStorageMetrics.Enabled()) assert.True(t, cfg.DisableDockerHealthCheck.Enabled()) assert.True(t, cfg.SpotInstanceDrainingEnabled.Enabled()) } diff --git a/agent/config/config_unix.go b/agent/config/config_unix.go index c8ac2ecf9b5..2b1d070e984 100644 --- a/agent/config/config_unix.go +++ b/agent/config/config_unix.go @@ -77,6 +77,8 @@ func DefaultConfig() Config { DataDir: "/data/", DataDirOnHost: "/var/lib/ecs", DisableMetrics: BooleanDefaultFalse{Value: ExplicitlyDisabled}, + DisableNetworkMetrics: BooleanDefaultFalse{Value: ExplicitlyDisabled}, + DisableStorageMetrics: BooleanDefaultFalse{Value: ExplicitlyDisabled}, ReservedMemory: 0, AvailableLoggingDrivers: []dockerclient.LoggingDriver{dockerclient.JSONFileDriver, dockerclient.NoneDriver}, TaskCleanupWaitDuration: DefaultTaskCleanupWaitDuration, diff --git a/agent/config/config_unix_test.go b/agent/config/config_unix_test.go index 56be9d3530f..a5362043fdb 100644 --- a/agent/config/config_unix_test.go +++ b/agent/config/config_unix_test.go @@ -41,6 +41,8 @@ func TestConfigDefault(t *testing.T) { assert.Equal(t, "unix:///var/run/docker.sock", cfg.DockerEndpoint, "Default docker endpoint set incorrectly") assert.Equal(t, "/data/", cfg.DataDir, "Default datadir set incorrectly") assert.False(t, cfg.DisableMetrics.Enabled(), "Default disablemetrics set incorrectly") + assert.False(t, cfg.DisableNetworkMetrics.Enabled(), "Default disablenetworkmetrics set incorrectly") + assert.False(t, cfg.DisableStorageMetrics.Enabled(), "Default disablestoragemetrics set incorrectly") assert.Equal(t, 5, len(cfg.ReservedPorts), "Default reserved ports set incorrectly") assert.Equal(t, uint16(0), cfg.ReservedMemory, "Default reserved memory set incorrectly") assert.Equal(t, 30*time.Second, cfg.DockerStopTimeout, "Default docker stop container timeout set incorrectly") diff --git a/agent/config/config_windows_test.go b/agent/config/config_windows_test.go index b67d15ceb13..e7547b995fb 100644 --- a/agent/config/config_windows_test.go +++ b/agent/config/config_windows_test.go @@ -43,6 +43,8 @@ func TestConfigDefault(t *testing.T) { assert.Equal(t, "npipe:////./pipe/docker_engine", cfg.DockerEndpoint, "Default docker endpoint set incorrectly") assert.Equal(t, `C:\ProgramData\Amazon\ECS\data`, cfg.DataDir, "Default datadir set incorrectly") assert.False(t, cfg.DisableMetrics.Enabled(), "Default disablemetrics set incorrectly") + assert.False(t, cfg.DisableStorageMetrics.Enabled(), "Default disablestoragemetrics set incorrectly") + assert.False(t, cfg.DisableNetworkMetrics.Enabled(), "Default disablenetworkmetrics set incorrectly") assert.Equal(t, 11, len(cfg.ReservedPorts), "Default reserved ports set incorrectly") assert.Equal(t, uint16(0), cfg.ReservedMemory, "Default reserved memory set incorrectly") assert.Equal(t, 30*time.Second, cfg.DockerStopTimeout, "Default docker stop container timeout set incorrectly") diff --git a/agent/config/types.go b/agent/config/types.go index c54f28c869d..536fee9db98 100644 --- a/agent/config/types.go +++ b/agent/config/types.go @@ -91,6 +91,14 @@ type Config struct { // sent to the ECS telemetry endpoint DisableMetrics BooleanDefaultFalse + // DisableNetworkMetrics configures whether task network IO utilization metrics should be + // sent to the ECS telemetry endpoint + DisableNetworkMetrics BooleanDefaultFalse + + // DisableStorageMetrics configures whether task block IO utilization metrics should be + // sent to the ECS telemetry endpoint + DisableStorageMetrics BooleanDefaultFalse + // PollMetrics configures whether metrics are constantly streamed for each container or // polled on interval instead. PollMetrics BooleanDefaultFalse diff --git a/agent/stats/engine.go b/agent/stats/engine.go index 10d2ab4f036..e0dab5c6512 100644 --- a/agent/stats/engine.go +++ b/agent/stats/engine.go @@ -864,14 +864,16 @@ func (engine *DockerStatsEngine) taskContainerMetricsUnsafe(taskArn string) ([]* MemoryStatsSet: memoryStatsSet, } - storageStatsSet, err := container.statsQueue.GetStorageStatsSet() - if err != nil && age > gracePeriod { - logger.Warn("Error getting storage stats for container", logger.Fields{ - field.Container: dockerID, - field.Error: err, - }) - } else { - containerMetric.StorageStatsSet = storageStatsSet + if !engine.config.DisableStorageMetrics.Enabled() { + storageStatsSet, err := container.statsQueue.GetStorageStatsSet() + if err != nil && age > gracePeriod { + logger.Warn("Error getting storage stats for container", logger.Fields{ + field.Container: dockerID, + field.Error: err, + }) + } else { + containerMetric.StorageStatsSet = storageStatsSet + } } restartStatsSet, err := container.statsQueue.GetRestartStatsSet() @@ -887,51 +889,53 @@ func (engine *DockerStatsEngine) taskContainerMetricsUnsafe(taskArn string) ([]* containerMetric.RestartStatsSet = restartStatsSet } - task, err := engine.resolver.ResolveTask(dockerID) - if err != nil { - logger.Warn("Task not found for container", logger.Fields{ - field.Container: dockerID, - field.Error: err, - }) - } else { - if dockerContainer, err := engine.resolver.ResolveContainer(dockerID); err != nil { - logger.Warn("Could not map container ID to container, container", logger.Fields{ - field.DockerId: dockerID, - field.Error: err, + if !engine.config.DisableNetworkMetrics.Enabled() { + task, err := engine.resolver.ResolveTask(dockerID) + if err != nil { + logger.Warn("Task not found for container", logger.Fields{ + field.Container: dockerID, + field.Error: err, }) } else { - // send network stats for default/bridge/nat/awsvpc network modes - if task.IsNetworkModeBridge() { - if task.IsServiceConnectEnabled() && dockerContainer.Container.Type == apicontainer.ContainerCNIPause { - seelog.Debug("Skip adding network stats for pause container in Service Connect enabled task") - } else { - networkStatsSet, err := container.statsQueue.GetNetworkStatsSet() - if err != nil && age > gracePeriod { - // we log the error and still continue to publish cpu, memory stats - logger.Warn("Error getting network stats for container", logger.Fields{ - field.Container: dockerID, - field.Error: err, - }) + if dockerContainer, err := engine.resolver.ResolveContainer(dockerID); err != nil { + logger.Warn("Could not map container ID to container, container", logger.Fields{ + field.DockerId: dockerID, + field.Error: err, + }) + } else { + // send network stats for default/bridge/nat/awsvpc network modes + if task.IsNetworkModeBridge() { + if task.IsServiceConnectEnabled() && dockerContainer.Container.Type == apicontainer.ContainerCNIPause { + seelog.Debug("Skip adding network stats for pause container in Service Connect enabled task") } else { - containerMetric.NetworkStatsSet = networkStatsSet + networkStatsSet, err := container.statsQueue.GetNetworkStatsSet() + if err != nil && age > gracePeriod { + // we log the error and still continue to publish cpu, memory stats + logger.Warn("Error getting network stats for container", logger.Fields{ + field.Container: dockerID, + field.Error: err, + }) + } else { + containerMetric.NetworkStatsSet = networkStatsSet + } } - } - } else if task.IsNetworkModeAWSVPC() { - taskStatsMap, taskExistsInTaskStats := engine.taskToTaskStats[taskArn] - if !taskExistsInTaskStats { - return nil, fmt.Errorf("task not found") - } - // do not add network stats for pause container - if dockerContainer.Container.Type != apicontainer.ContainerCNIPause { - networkStats, err := taskStatsMap.StatsQueue.GetNetworkStatsSet() - if err != nil && age > gracePeriod { - logger.Warn("Error getting network stats for container", logger.Fields{ - field.TaskARN: taskArn, - field.Container: dockerContainer.DockerID, - field.Error: err, - }) - } else { - containerMetric.NetworkStatsSet = networkStats + } else if task.IsNetworkModeAWSVPC() { + taskStatsMap, taskExistsInTaskStats := engine.taskToTaskStats[taskArn] + if !taskExistsInTaskStats { + return nil, fmt.Errorf("task not found") + } + // do not add network stats for pause container + if dockerContainer.Container.Type != apicontainer.ContainerCNIPause { + networkStats, err := taskStatsMap.StatsQueue.GetNetworkStatsSet() + if err != nil && age > gracePeriod { + logger.Warn("Error getting network stats for container", logger.Fields{ + field.TaskARN: taskArn, + field.Container: dockerContainer.DockerID, + field.Error: err, + }) + } else { + containerMetric.NetworkStatsSet = networkStats + } } } } diff --git a/agent/stats/engine_test.go b/agent/stats/engine_test.go index 5e34473298e..888f47d39c4 100644 --- a/agent/stats/engine_test.go +++ b/agent/stats/engine_test.go @@ -724,20 +724,22 @@ func TestSynchronizeOnRestart(t *testing.T) { func TestTaskNetworkStatsSet(t *testing.T) { var networkModes = []struct { - ENIs []*ni.NetworkInterface - NetworkMode string - ServiceConnectEnabled bool - StatsEmpty bool + ENIs []*ni.NetworkInterface + NetworkMode string + ServiceConnectEnabled bool + NetworkMetricsDisabled bool + StatsEmpty bool }{ - {nil, DefaultNetworkMode, false, false}, - {nil, DefaultNetworkMode, true, true}, + {nil, DefaultNetworkMode, false, false, false}, + {nil, DefaultNetworkMode, true, false, true}, + {nil, DefaultNetworkMode, false, true, true}, } for _, tc := range networkModes { - testNetworkModeStats(t, tc.NetworkMode, tc.ENIs, tc.ServiceConnectEnabled, tc.StatsEmpty) + testNetworkModeStats(t, tc.NetworkMode, tc.ENIs, tc.ServiceConnectEnabled, tc.NetworkMetricsDisabled, tc.StatsEmpty) } } -func testNetworkModeStats(t *testing.T, netMode string, enis []*ni.NetworkInterface, serviceConnectEnabled, emptyStats bool) { +func testNetworkModeStats(t *testing.T, netMode string, enis []*ni.NetworkInterface, serviceConnectEnabled, networkMetricsDisabled, emptyStats bool) { mockCtrl := gomock.NewController(t) defer mockCtrl.Finish() resolver := mock_resolver.NewMockContainerMetadataResolver(mockCtrl) @@ -782,7 +784,18 @@ func testNetworkModeStats(t *testing.T, netMode string, enis []*ni.NetworkInterf State: &types.ContainerState{Pid: 23}, }, }, nil).AnyTimes() - engine := NewDockerStatsEngine(&cfg, nil, eventStream("TestTaskNetworkStatsSet"), nil, nil, nil) + + var c *config.Config + if networkMetricsDisabled { + c = &config.Config{ + DisableNetworkMetrics: config.BooleanDefaultFalse{Value: config.ExplicitlyEnabled}, + } + c = c.Merge(cfg) + } else { + c = &cfg + } + + engine := NewDockerStatsEngine(c, nil, eventStream("TestTaskNetworkStatsSet"), nil, nil, nil) ctx, cancel := context.WithCancel(context.TODO()) defer cancel() engine.ctx = ctx diff --git a/agent/stats/engine_unix_test.go b/agent/stats/engine_unix_test.go index 2c1bb4ecf2d..9196c80bb08 100644 --- a/agent/stats/engine_unix_test.go +++ b/agent/stats/engine_unix_test.go @@ -43,17 +43,19 @@ const ( func TestLinuxTaskNetworkStatsSet(t *testing.T) { var networkModes = []struct { - ENIs []*ni.NetworkInterface - NetworkMode string - StatsEmpty bool + ENIs []*ni.NetworkInterface + NetworkMode string + NetworkMetricsDisabled bool + StatsEmpty bool }{ - {[]*ni.NetworkInterface{{ID: "ec2Id"}}, "awsvpc", true}, - {nil, "host", true}, - {nil, "bridge", false}, - {nil, "none", true}, + {[]*ni.NetworkInterface{{ID: "ec2Id"}}, "awsvpc", false, true}, + {nil, "host", false, true}, + {nil, "bridge", false, false}, + {nil, "bridge", true, true}, + {nil, "none", false, true}, } for _, tc := range networkModes { - testNetworkModeStats(t, tc.NetworkMode, tc.ENIs, false, tc.StatsEmpty) + testNetworkModeStats(t, tc.NetworkMode, tc.ENIs, false, tc.NetworkMetricsDisabled, tc.StatsEmpty) } }