Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added ability to poll for container stats instead of constant stream #1475

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,8 @@ additional details on each available environment variable.
| `ECS_UPDATES_ENABLED` | <true | false> | Whether to exit for an updater to apply updates when requested. | false | false |
| `ECS_UPDATE_DOWNLOAD_DIR` | /cache | Where to place update tarballs within the container. | | |
| `ECS_DISABLE_METRICS` | <true | false> | Whether to disable metrics gathering for tasks. | false | true |
| `ECS_POLL_METRICS` | <true | false> | Whether to poll or stream when gathering metrics for tasks. | false | false |
| `ECS_POLLING_METRICS_WAIT_DURATION` | 30s | Time to wait to poll for new metrics for a task. Only used when ECS_POLL_METRICS is true | 15s | 15s |
| `ECS_RESERVED_MEMORY` | 32 | Memory, in MB, to reserve for use by things other than containers managed by Amazon ECS. | 0 | 0 |
| `ECS_AVAILABLE_LOGGING_DRIVERS` | `["awslogs","fluentd","gelf","json-file","journald","logentries","splunk","syslog"]` | Which logging drivers are available on the container instance. | `["json-file","none"]` | `["json-file","none"]` |
| `ECS_DISABLE_PRIVILEGED` | `true` | Whether launching privileged containers is disabled on the container instance. | `false` | `false` |
Expand Down
32 changes: 32 additions & 0 deletions agent/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,10 @@ const (
// clean up task's containers.
DefaultTaskCleanupWaitDuration = 3 * time.Hour

// DefaultPollingMetricsWaitDuration specifies the default value for polling metrics wait duration
// This is only used when PollMetrics is set to true
DefaultPollingMetricsWaitDuration = 15 * time.Second

// defaultDockerStopTimeout specifies the value for container stop timeout duration
defaultDockerStopTimeout = 30 * time.Second

Expand All @@ -77,6 +81,14 @@ const (
// a task's container. This is used to enforce sane values for the config.TaskCleanupWaitDuration field.
minimumTaskCleanupWaitDuration = 1 * time.Minute

// minimumPollingMetricsWaitDuration specifies the minimum duration to wait before polling for new stats
// from docker. This is only used when PollMetrics is set to true
minimumPollingMetricsWaitDuration = 1 * time.Second

// maximumPollingMetricsWaitDuration specifies the maximum duration to wait before polling for new stats
// from docker. This is only used when PollMetrics is set to true
maximumPollingMetricsWaitDuration = 20 * time.Second

// minimumDockerStopTimeout specifies the minimum value for docker StopContainer API
minimumDockerStopTimeout = 1 * time.Second

Expand Down Expand Up @@ -271,6 +283,20 @@ func (cfg *Config) validateAndOverrideBounds() error {
cfg.TaskMetadataBurstRate = DefaultTaskMetadataBurstRate
}

// check the PollMetrics specific configurations
if cfg.PollMetrics {

if cfg.PollingMetricsWaitDuration < minimumPollingMetricsWaitDuration {
seelog.Warnf("Invalid value for polling metrics wait duration, will be overridden with the default value: %s. Parsed value: %v, minimum value: %v.", DefaultPollingMetricsWaitDuration.String(), cfg.PollingMetricsWaitDuration, minimumPollingMetricsWaitDuration)
cfg.PollingMetricsWaitDuration = DefaultPollingMetricsWaitDuration
}

if cfg.PollingMetricsWaitDuration > maximumPollingMetricsWaitDuration {
seelog.Warnf("Invalid value for polling metrics wait duration, will be overridden with the default value: %s. Parsed value: %v, maximum value: %v.", DefaultPollingMetricsWaitDuration.String(), cfg.PollingMetricsWaitDuration, maximumPollingMetricsWaitDuration)
cfg.PollingMetricsWaitDuration = DefaultPollingMetricsWaitDuration
}
}

cfg.platformOverrides()

return nil
Expand Down Expand Up @@ -391,6 +417,8 @@ func environmentConfig() (Config, error) {
UpdatesEnabled: utils.ParseBool(os.Getenv("ECS_UPDATES_ENABLED"), false),
UpdateDownloadDir: os.Getenv("ECS_UPDATE_DOWNLOAD_DIR"),
DisableMetrics: utils.ParseBool(os.Getenv("ECS_DISABLE_METRICS"), false),
PollMetrics: utils.ParseBool(os.Getenv("ECS_POLL_METRICS"), false),
PollingMetricsWaitDuration: parseEnvVariableDuration("ECS_POLLING_METRICS_WAIT_DURATION"),
ReservedMemory: parseEnvVariableUint16("ECS_RESERVED_MEMORY"),
AvailableLoggingDrivers: parseAvailableLoggingDrivers(),
PrivilegedDisabled: utils.ParseBool(os.Getenv("ECS_DISABLE_PRIVILEGED"), false),
Expand Down Expand Up @@ -443,6 +471,8 @@ func (cfg *Config) String() string {
"AuthType: %v, "+
"UpdatesEnabled: %v, "+
"DisableMetrics: %v, "+
"PollMetrics: %v, "+
"PollingMetricsWaitDuration: %v, "+
"ReservedMem: %v, "+
"TaskCleanupWaitDuration: %v, "+
"DockerStopTimeout: %v, "+
Expand All @@ -456,6 +486,8 @@ func (cfg *Config) String() string {
cfg.EngineAuthType,
cfg.UpdatesEnabled,
cfg.DisableMetrics,
cfg.PollMetrics,
cfg.PollingMetricsWaitDuration,
cfg.ReservedMemory,
cfg.TaskCleanupWaitDuration,
cfg.DockerStopTimeout,
Expand Down
31 changes: 31 additions & 0 deletions agent/config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,8 @@ func TestEnvironmentConfig(t *testing.T) {
defer setTestEnv("ECS_INSTANCE_ATTRIBUTES", "{\"my_attribute\": \"testing\"}")()
defer setTestEnv("ECS_ENABLE_TASK_ENI", "true")()
defer setTestEnv("ECS_TASK_METADATA_RPS_LIMIT", "1000,1100")()
defer setTestEnv("ECS_POLL_METRICS", "true")()
defer setTestEnv("ECS_POLLING_METRICS_WAIT_DURATION", "10s")()
additionalLocalRoutesJSON := `["1.2.3.4/22","5.6.7.8/32"]`
setTestEnv("ECS_AWSVPC_ADDITIONAL_LOCAL_ROUTES", additionalLocalRoutesJSON)
setTestEnv("ECS_ENABLE_CONTAINER_METADATA", "true")
Expand All @@ -110,6 +112,9 @@ func TestEnvironmentConfig(t *testing.T) {
assert.True(t, conf.TaskIAMRoleEnabled, "Wrong value for TaskIAMRoleEnabled")
assert.True(t, conf.TaskIAMRoleEnabledForNetworkHost, "Wrong value for TaskIAMRoleEnabledForNetworkHost")
assert.True(t, conf.ImageCleanupDisabled, "Wrong value for ImageCleanupDisabled")
assert.True(t, conf.PollMetrics, "Wrong value for PollMetrics")
expectedDurationPollingMetricsWaitDuration, _ := time.ParseDuration("10s")
assert.Equal(t, expectedDurationPollingMetricsWaitDuration, conf.PollingMetricsWaitDuration)

assert.True(t, conf.TaskENIEnabled, "Wrong value for TaskNetwork")
assert.Equal(t, (30 * time.Minute), conf.MinimumImageDeletionAge)
Expand Down Expand Up @@ -179,6 +184,12 @@ func TestInvalidLoggingDriver(t *testing.T) {
assert.Error(t, conf.validateAndOverrideBounds(), "Should be error with invalid-logging-driver")
}

func TestDefaultPollMetricsWithoutECSDataDir(t *testing.T) {
conf, err := environmentConfig()
assert.NoError(t, err)
assert.False(t, conf.PollMetrics)
}

func TestDefaultCheckpointWithoutECSDataDir(t *testing.T) {
conf, err := environmentConfig()
assert.NoError(t, err)
Expand Down Expand Up @@ -259,6 +270,26 @@ func TestInvalidValueContainerStartTimeout(t *testing.T) {
assert.Equal(t, conf.ContainerStartTimeout, minimumContainerStartTimeout, "Wrong value for ContainerStartTimeout")
}

func TestInvalidValueMaxPollingMetricsWaitDuration(t *testing.T) {
defer setTestRegion()()
defer setTestEnv("ECS_POLLING_METRICS_WAIT_DURATION", "21s")()
ctrl := gomock.NewController(t)
mockEc2Metadata := mock_ec2.NewMockEC2MetadataClient(ctrl)
conf, err := NewConfig(mockEc2Metadata)
assert.NoError(t, err)
assert.Equal(t, conf.PollingMetricsWaitDuration, maximumPollingMetricsWaitDuration, "Wrong value for PollingMetricsWaitDuration")
}

func TestInvalidValueMinPollingMetricsWaitDuration(t *testing.T) {
defer setTestRegion()()
defer setTestEnv("ECS_POLLING_METRICS_WAIT_DURATION", "0s")()
ctrl := gomock.NewController(t)
mockEc2Metadata := mock_ec2.NewMockEC2MetadataClient(ctrl)
conf, err := NewConfig(mockEc2Metadata)
assert.NoError(t, err)
assert.Equal(t, conf.PollingMetricsWaitDuration, minimumPollingMetricsWaitDuration, "Wrong value for PollingMetricsWaitDuration")
}

func TestInvalidFormatParseEnvVariableUint16(t *testing.T) {
defer setTestRegion()()
setTestEnv("FOO", "foo")
Expand Down
2 changes: 2 additions & 0 deletions agent/config/config_unix.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ func DefaultConfig() Config {
DisableMetrics: false,
ReservedMemory: 0,
AvailableLoggingDrivers: []dockerclient.LoggingDriver{dockerclient.JSONFileDriver, dockerclient.NoneDriver},
PollMetrics: false,
PollingMetricsWaitDuration: DefaultPollingMetricsWaitDuration,
TaskCleanupWaitDuration: DefaultTaskCleanupWaitDuration,
DockerStopTimeout: defaultDockerStopTimeout,
ContainerStartTimeout: defaultContainerStartTimeout,
Expand Down
2 changes: 2 additions & 0 deletions agent/config/config_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ func DefaultConfig() Config {
ReservedMemory: 0,
AvailableLoggingDrivers: []dockerclient.LoggingDriver{dockerclient.JSONFileDriver, dockerclient.NoneDriver, dockerclient.AWSLogsDriver},
TaskCleanupWaitDuration: DefaultTaskCleanupWaitDuration,
PollMetrics: false,
PollingMetricsWaitDuration: DefaultPollingMetricsWaitDuration,
DockerStopTimeout: defaultDockerStopTimeout,
ContainerStartTimeout: defaultContainerStartTimeout,
CredentialsAuditLogFile: filepath.Join(ecsRoot, defaultCredentialsAuditLogFile),
Expand Down
8 changes: 8 additions & 0 deletions agent/config/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,14 @@ type Config struct {
// sent to the ECS telemetry endpoint
DisableMetrics bool

// PollMetrics configures whether metrics are constantly streamed for each container or
// polled on interval instead.
PollMetrics bool

// PollingMetricsWaitDuration configures how long a container should wait before polling metrics
// again when PollMetrics is set to true
PollingMetricsWaitDuration time.Duration

// ReservedMemory specifies the amount of memory (in MB) to reserve for things
// other than containers managed by ECS
ReservedMemory uint16
Expand Down
59 changes: 44 additions & 15 deletions agent/dockerclient/dockerapi/docker_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"encoding/json"
"fmt"
"io"
"math/rand"
"strings"
"sync"
"time"
Expand Down Expand Up @@ -1244,24 +1245,52 @@ func (dg *dockerGoClient) Stats(id string, ctx context.Context) (<-chan *docker.
return nil, err
}

stats := make(chan *docker.Stats)
options := docker.StatsOptions{
ID: id,
Stats: stats,
Stream: true,
Context: ctx,
InactivityTimeout: StatsInactivityTimeout,
}
returnStats := make(chan *docker.Stats)

go func() {
statsErr := client.Stats(options)
if statsErr != nil {
seelog.Infof("DockerGoClient: Unable to retrieve stats for container %s: %v",
id, statsErr)
if !dg.config.PollMetrics {
seelog.Infof("DockerGoClient: Starting to Stream for metrics")
options := docker.StatsOptions{
ID: id,
Stats: returnStats,
Stream: true,
Context: ctx,
InactivityTimeout: StatsInactivityTimeout,
}
}()

go func() {
statsErr := client.Stats(options)
if statsErr != nil {
seelog.Infof("DockerGoClient: Unable to retrieve stats for container %s: %v",
id, statsErr)
}
}()
} else {
seelog.Infof("DockerGoClient: Starting to Poll for metrics")
//Sleep for a random period time up to the polling interval. This will help make containers ask for stats at different times
time.Sleep(time.Second * time.Duration(rand.Intn(int(dg.config.PollingMetricsWaitDuration.Seconds()))))

statPollTicker := time.NewTicker(dg.config.PollingMetricsWaitDuration)
go func() {
for range statPollTicker.C {
stats := make(chan *docker.Stats, 1)
options := docker.StatsOptions{
ID: id,
Stats: stats,
Stream: false,
Context: ctx,
InactivityTimeout: StatsInactivityTimeout,
}
client.Stats(options)

dockerStats, ok := <-stats
if ok {
returnStats <- dockerStats
}
}
}()
}

return stats, nil
return returnStats, nil
}

// RemoveImage invokes github.com/fsouza/go-dockerclient.Client's
Expand Down