diff --git a/README.md b/README.md index d5232a87a93..fe092b58dbf 100644 --- a/README.md +++ b/README.md @@ -157,6 +157,8 @@ additional details on each available environment variable. | `ECS_UPDATES_ENABLED` | <true | false> | Whether to exit for an updater to apply updates when requested. | false | false | | `ECS_UPDATE_DOWNLOAD_DIR` | /cache | Where to place update tarballs within the container. | | | | `ECS_DISABLE_METRICS` | <true | false> | Whether to disable metrics gathering for tasks. | false | true | +| `ECS_POLL_METRICS` | <true | false> | Whether to poll or stream when gathering metrics for tasks. | false | false | +| `ECS_POLLING_METRICS_WAIT_DURATION` | 30s | Time to wait to poll for new metrics for a task. Only used when ECS_POLL_METRICS is true | 15s | 15s | | `ECS_RESERVED_MEMORY` | 32 | Memory, in MB, to reserve for use by things other than containers managed by Amazon ECS. | 0 | 0 | | `ECS_AVAILABLE_LOGGING_DRIVERS` | `["awslogs","fluentd","gelf","json-file","journald","logentries","splunk","syslog"]` | Which logging drivers are available on the container instance. | `["json-file","none"]` | `["json-file","none"]` | | `ECS_DISABLE_PRIVILEGED` | `true` | Whether launching privileged containers is disabled on the container instance. | `false` | `false` | diff --git a/agent/config/config.go b/agent/config/config.go index c629886bcc7..23094e23882 100644 --- a/agent/config/config.go +++ b/agent/config/config.go @@ -58,6 +58,10 @@ const ( // clean up task's containers. DefaultTaskCleanupWaitDuration = 3 * time.Hour + // DefaultPollingMetricsWaitDuration specifies the default value for polling metrics wait duration + // This is only used when PollMetrics is set to true + DefaultPollingMetricsWaitDuration = 15 * time.Second + // defaultDockerStopTimeout specifies the value for container stop timeout duration defaultDockerStopTimeout = 30 * time.Second @@ -77,6 +81,14 @@ const ( // a task's container. This is used to enforce sane values for the config.TaskCleanupWaitDuration field. minimumTaskCleanupWaitDuration = 1 * time.Minute + // minimumPollingMetricsWaitDuration specifies the minimum duration to wait before polling for new stats + // from docker. This is only used when PollMetrics is set to true + minimumPollingMetricsWaitDuration = 1 * time.Second + + // maximumPollingMetricsWaitDuration specifies the maximum duration to wait before polling for new stats + // from docker. This is only used when PollMetrics is set to true + maximumPollingMetricsWaitDuration = 20 * time.Second + // minimumDockerStopTimeout specifies the minimum value for docker StopContainer API minimumDockerStopTimeout = 1 * time.Second @@ -271,6 +283,20 @@ func (cfg *Config) validateAndOverrideBounds() error { cfg.TaskMetadataBurstRate = DefaultTaskMetadataBurstRate } + // check the PollMetrics specific configurations + if cfg.PollMetrics { + + if cfg.PollingMetricsWaitDuration < minimumPollingMetricsWaitDuration { + seelog.Warnf("Invalid value for polling metrics wait duration, will be overridden with the default value: %s. Parsed value: %v, minimum value: %v.", DefaultPollingMetricsWaitDuration.String(), cfg.PollingMetricsWaitDuration, minimumPollingMetricsWaitDuration) + cfg.PollingMetricsWaitDuration = DefaultPollingMetricsWaitDuration + } + + if cfg.PollingMetricsWaitDuration > maximumPollingMetricsWaitDuration { + seelog.Warnf("Invalid value for polling metrics wait duration, will be overridden with the default value: %s. Parsed value: %v, maximum value: %v.", DefaultPollingMetricsWaitDuration.String(), cfg.PollingMetricsWaitDuration, maximumPollingMetricsWaitDuration) + cfg.PollingMetricsWaitDuration = DefaultPollingMetricsWaitDuration + } + } + cfg.platformOverrides() return nil @@ -391,6 +417,8 @@ func environmentConfig() (Config, error) { UpdatesEnabled: utils.ParseBool(os.Getenv("ECS_UPDATES_ENABLED"), false), UpdateDownloadDir: os.Getenv("ECS_UPDATE_DOWNLOAD_DIR"), DisableMetrics: utils.ParseBool(os.Getenv("ECS_DISABLE_METRICS"), false), + PollMetrics: utils.ParseBool(os.Getenv("ECS_POLL_METRICS"), false), + PollingMetricsWaitDuration: parseEnvVariableDuration("ECS_POLLING_METRICS_WAIT_DURATION"), ReservedMemory: parseEnvVariableUint16("ECS_RESERVED_MEMORY"), AvailableLoggingDrivers: parseAvailableLoggingDrivers(), PrivilegedDisabled: utils.ParseBool(os.Getenv("ECS_DISABLE_PRIVILEGED"), false), @@ -443,6 +471,8 @@ func (cfg *Config) String() string { "AuthType: %v, "+ "UpdatesEnabled: %v, "+ "DisableMetrics: %v, "+ + "PollMetrics: %v, "+ + "PollingMetricsWaitDuration: %v, "+ "ReservedMem: %v, "+ "TaskCleanupWaitDuration: %v, "+ "DockerStopTimeout: %v, "+ @@ -456,6 +486,8 @@ func (cfg *Config) String() string { cfg.EngineAuthType, cfg.UpdatesEnabled, cfg.DisableMetrics, + cfg.PollMetrics, + cfg.PollingMetricsWaitDuration, cfg.ReservedMemory, cfg.TaskCleanupWaitDuration, cfg.DockerStopTimeout, diff --git a/agent/config/config_test.go b/agent/config/config_test.go index a0cf862c0e6..67635f377cb 100644 --- a/agent/config/config_test.go +++ b/agent/config/config_test.go @@ -87,6 +87,8 @@ func TestEnvironmentConfig(t *testing.T) { defer setTestEnv("ECS_INSTANCE_ATTRIBUTES", "{\"my_attribute\": \"testing\"}")() defer setTestEnv("ECS_ENABLE_TASK_ENI", "true")() defer setTestEnv("ECS_TASK_METADATA_RPS_LIMIT", "1000,1100")() + defer setTestEnv("ECS_POLL_METRICS", "true")() + defer setTestEnv("ECS_POLLING_METRICS_WAIT_DURATION", "10s")() additionalLocalRoutesJSON := `["1.2.3.4/22","5.6.7.8/32"]` setTestEnv("ECS_AWSVPC_ADDITIONAL_LOCAL_ROUTES", additionalLocalRoutesJSON) setTestEnv("ECS_ENABLE_CONTAINER_METADATA", "true") @@ -110,6 +112,9 @@ func TestEnvironmentConfig(t *testing.T) { assert.True(t, conf.TaskIAMRoleEnabled, "Wrong value for TaskIAMRoleEnabled") assert.True(t, conf.TaskIAMRoleEnabledForNetworkHost, "Wrong value for TaskIAMRoleEnabledForNetworkHost") assert.True(t, conf.ImageCleanupDisabled, "Wrong value for ImageCleanupDisabled") + assert.True(t, conf.PollMetrics, "Wrong value for PollMetrics") + expectedDurationPollingMetricsWaitDuration, _ := time.ParseDuration("10s") + assert.Equal(t, expectedDurationPollingMetricsWaitDuration, conf.PollingMetricsWaitDuration) assert.True(t, conf.TaskENIEnabled, "Wrong value for TaskNetwork") assert.Equal(t, (30 * time.Minute), conf.MinimumImageDeletionAge) @@ -179,6 +184,12 @@ func TestInvalidLoggingDriver(t *testing.T) { assert.Error(t, conf.validateAndOverrideBounds(), "Should be error with invalid-logging-driver") } +func TestDefaultPollMetricsWithoutECSDataDir(t *testing.T) { + conf, err := environmentConfig() + assert.NoError(t, err) + assert.False(t, conf.PollMetrics) +} + func TestDefaultCheckpointWithoutECSDataDir(t *testing.T) { conf, err := environmentConfig() assert.NoError(t, err) @@ -259,6 +270,26 @@ func TestInvalidValueContainerStartTimeout(t *testing.T) { assert.Equal(t, conf.ContainerStartTimeout, minimumContainerStartTimeout, "Wrong value for ContainerStartTimeout") } +func TestInvalidValueMaxPollingMetricsWaitDuration(t *testing.T) { + defer setTestRegion()() + defer setTestEnv("ECS_POLLING_METRICS_WAIT_DURATION", "21s")() + ctrl := gomock.NewController(t) + mockEc2Metadata := mock_ec2.NewMockEC2MetadataClient(ctrl) + conf, err := NewConfig(mockEc2Metadata) + assert.NoError(t, err) + assert.Equal(t, conf.PollingMetricsWaitDuration, maximumPollingMetricsWaitDuration, "Wrong value for PollingMetricsWaitDuration") +} + +func TestInvalidValueMinPollingMetricsWaitDuration(t *testing.T) { + defer setTestRegion()() + defer setTestEnv("ECS_POLLING_METRICS_WAIT_DURATION", "0s")() + ctrl := gomock.NewController(t) + mockEc2Metadata := mock_ec2.NewMockEC2MetadataClient(ctrl) + conf, err := NewConfig(mockEc2Metadata) + assert.NoError(t, err) + assert.Equal(t, conf.PollingMetricsWaitDuration, minimumPollingMetricsWaitDuration, "Wrong value for PollingMetricsWaitDuration") +} + func TestInvalidFormatParseEnvVariableUint16(t *testing.T) { defer setTestRegion()() setTestEnv("FOO", "foo") diff --git a/agent/config/config_unix.go b/agent/config/config_unix.go index 6d709285401..fef033d9897 100644 --- a/agent/config/config_unix.go +++ b/agent/config/config_unix.go @@ -49,6 +49,8 @@ func DefaultConfig() Config { DisableMetrics: false, ReservedMemory: 0, AvailableLoggingDrivers: []dockerclient.LoggingDriver{dockerclient.JSONFileDriver, dockerclient.NoneDriver}, + PollMetrics: false, + PollingMetricsWaitDuration: DefaultPollingMetricsWaitDuration, TaskCleanupWaitDuration: DefaultTaskCleanupWaitDuration, DockerStopTimeout: defaultDockerStopTimeout, ContainerStartTimeout: defaultContainerStartTimeout, diff --git a/agent/config/config_windows.go b/agent/config/config_windows.go index b48d94ea46a..a78859fe9a8 100644 --- a/agent/config/config_windows.go +++ b/agent/config/config_windows.go @@ -78,6 +78,8 @@ func DefaultConfig() Config { ReservedMemory: 0, AvailableLoggingDrivers: []dockerclient.LoggingDriver{dockerclient.JSONFileDriver, dockerclient.NoneDriver, dockerclient.AWSLogsDriver}, TaskCleanupWaitDuration: DefaultTaskCleanupWaitDuration, + PollMetrics: false, + PollingMetricsWaitDuration: DefaultPollingMetricsWaitDuration, DockerStopTimeout: defaultDockerStopTimeout, ContainerStartTimeout: defaultContainerStartTimeout, CredentialsAuditLogFile: filepath.Join(ecsRoot, defaultCredentialsAuditLogFile), diff --git a/agent/config/types.go b/agent/config/types.go index 5d790b5b694..b4fd1be0203 100644 --- a/agent/config/types.go +++ b/agent/config/types.go @@ -85,6 +85,14 @@ type Config struct { // sent to the ECS telemetry endpoint DisableMetrics bool + // PollMetrics configures whether metrics are constantly streamed for each container or + // polled on interval instead. + PollMetrics bool + + // PollingMetricsWaitDuration configures how long a container should wait before polling metrics + // again when PollMetrics is set to true + PollingMetricsWaitDuration time.Duration + // ReservedMemory specifies the amount of memory (in MB) to reserve for things // other than containers managed by ECS ReservedMemory uint16 diff --git a/agent/dockerclient/dockerapi/docker_client.go b/agent/dockerclient/dockerapi/docker_client.go index c5caa838872..6d8255d384c 100644 --- a/agent/dockerclient/dockerapi/docker_client.go +++ b/agent/dockerclient/dockerapi/docker_client.go @@ -20,6 +20,7 @@ import ( "encoding/json" "fmt" "io" + "math/rand" "strings" "sync" "time" @@ -1244,24 +1245,52 @@ func (dg *dockerGoClient) Stats(id string, ctx context.Context) (<-chan *docker. return nil, err } - stats := make(chan *docker.Stats) - options := docker.StatsOptions{ - ID: id, - Stats: stats, - Stream: true, - Context: ctx, - InactivityTimeout: StatsInactivityTimeout, - } + returnStats := make(chan *docker.Stats) - go func() { - statsErr := client.Stats(options) - if statsErr != nil { - seelog.Infof("DockerGoClient: Unable to retrieve stats for container %s: %v", - id, statsErr) + if !dg.config.PollMetrics { + seelog.Infof("DockerGoClient: Starting to Stream for metrics") + options := docker.StatsOptions{ + ID: id, + Stats: returnStats, + Stream: true, + Context: ctx, + InactivityTimeout: StatsInactivityTimeout, } - }() + + go func() { + statsErr := client.Stats(options) + if statsErr != nil { + seelog.Infof("DockerGoClient: Unable to retrieve stats for container %s: %v", + id, statsErr) + } + }() + } else { + seelog.Infof("DockerGoClient: Starting to Poll for metrics") + //Sleep for a random period time up to the polling interval. This will help make containers ask for stats at different times + time.Sleep(time.Second * time.Duration(rand.Intn(int(dg.config.PollingMetricsWaitDuration.Seconds())))) + + statPollTicker := time.NewTicker(dg.config.PollingMetricsWaitDuration) + go func() { + for range statPollTicker.C { + stats := make(chan *docker.Stats, 1) + options := docker.StatsOptions{ + ID: id, + Stats: stats, + Stream: false, + Context: ctx, + InactivityTimeout: StatsInactivityTimeout, + } + client.Stats(options) + + dockerStats, ok := <-stats + if ok { + returnStats <- dockerStats + } + } + }() + } - return stats, nil + return returnStats, nil } // RemoveImage invokes github.com/fsouza/go-dockerclient.Client's