Prometheus metrics: optionally export total CPU instead of per-CPU
Per-CPU stats are more expensive to transport and store, and that
level of detail is not required in many cases.

We export the overall total CPU in the same metric as the per-CPU
values, so that dashboards which previously summed over CPUs will
work identically.
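For example, a dashboard query such as sum by (name) (rate(container_cpu_usage_seconds_total[5m])) should give the same result whether it aggregates the per-CPU series or the single series labelled cpu="total" (illustrative query, not part of this change).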
bboreham committed Feb 20, 2018
commit ec6da3a (1 parent: 6116f26)
Showing 6 changed files with 24 additions and 11 deletions.
cadvisor.go (3 changes: 2 additions & 1 deletion)
@@ -72,6 +72,7 @@ var (
 		container.NetworkUsageMetrics: struct{}{},
 		container.NetworkTcpUsageMetrics: struct{}{},
 		container.NetworkUdpUsageMetrics: struct{}{},
+		container.PerCpuUsageMetrics: struct{}{},
 	}
 )

@@ -103,7 +104,7 @@ func (ml *metricSetValue) Set(value string) error {
 }
 
 func init() {
-	flag.Var(&ignoreMetrics, "disable_metrics", "comma-separated list of `metrics` to be disabled. Options are 'disk', 'network', 'tcp', 'udp'. Note: tcp and udp are disabled by default due to high CPU usage.")
+	flag.Var(&ignoreMetrics, "disable_metrics", "comma-separated list of `metrics` to be disabled. Options are 'disk', 'network', 'tcp', 'udp', 'percpu'. Note: tcp and udp are disabled by default due to high CPU usage.")
 
 	// Default logging verbosity to V(2)
 	flag.Set("v", "2")
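With the whitelist and flag changes above, per-CPU collection can now be switched off at startup, for example (hypothetical invocation; only the flag name and the 'percpu' value come from this diff):

	cadvisor -disable_metrics=percpu

Unlike tcp and udp, percpu is not disabled by default; it only drops out when listed in -disable_metrics.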
container/factory.go (1 change: 1 addition & 0 deletions)
@@ -42,6 +42,7 @@ type MetricKind string
 
 const (
 	CpuUsageMetrics MetricKind = "cpu"
+	PerCpuUsageMetrics MetricKind = "percpu"
 	MemoryUsageMetrics MetricKind = "memory"
 	CpuLoadMetrics MetricKind = "cpuLoad"
 	DiskIOMetrics MetricKind = "diskIO"
container/libcontainer/helpers.go (15 changes: 9 additions & 6 deletions)
@@ -113,7 +113,8 @@ func GetStats(cgroupManager cgroups.Manager, rootFs string, pid int, ignoreMetri
 	libcontainerStats := &libcontainer.Stats{
 		CgroupStats: cgroupStats,
 	}
-	stats := newContainerStats(libcontainerStats)
+	withPerCPU := !ignoreMetrics.Has(container.PerCpuUsageMetrics)
+	stats := newContainerStats(libcontainerStats, withPerCPU)
 
 	// If we know the pid then get network stats from /proc/<pid>/net/dev
 	if pid == 0 {
@@ -467,14 +468,17 @@ func minUint32(x, y uint32) uint32 {
 var numCpusFunc = getNumberOnlineCPUs
 
 // Convert libcontainer stats to info.ContainerStats.
-func setCpuStats(s *cgroups.Stats, ret *info.ContainerStats) {
+func setCpuStats(s *cgroups.Stats, ret *info.ContainerStats, withPerCPU bool) {
 	ret.Cpu.Usage.User = s.CpuStats.CpuUsage.UsageInUsermode
 	ret.Cpu.Usage.System = s.CpuStats.CpuUsage.UsageInKernelmode
-	ret.Cpu.Usage.Total = 0
+	ret.Cpu.Usage.Total = s.CpuStats.CpuUsage.TotalUsage
 	ret.Cpu.CFS.Periods = s.CpuStats.ThrottlingData.Periods
 	ret.Cpu.CFS.ThrottledPeriods = s.CpuStats.ThrottlingData.ThrottledPeriods
 	ret.Cpu.CFS.ThrottledTime = s.CpuStats.ThrottlingData.ThrottledTime
 
+	if !withPerCPU {
+		return
+	}
 	if len(s.CpuStats.CpuUsage.PercpuUsage) == 0 {
 		// libcontainer's 'GetStats' can leave 'PercpuUsage' nil if it skipped the
 		// cpuacct subsystem.
@@ -501,7 +505,6 @@
 
 	for i := uint32(0); i < numActual; i++ {
 		ret.Cpu.Usage.PerCpu[i] = s.CpuStats.CpuUsage.PercpuUsage[i]
-		ret.Cpu.Usage.Total += s.CpuStats.CpuUsage.PercpuUsage[i]
 	}
 
 }
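Taken together, the two hunks leave setCpuStats reading roughly as below (a sketch: the context between the hunks, which trims the per-CPU slice to the number of CPUs actually present, is paraphrased and its exact variable names are assumptions):

func setCpuStats(s *cgroups.Stats, ret *info.ContainerStats, withPerCPU bool) {
	ret.Cpu.Usage.User = s.CpuStats.CpuUsage.UsageInUsermode
	ret.Cpu.Usage.System = s.CpuStats.CpuUsage.UsageInKernelmode
	// Total now comes from the cgroup's aggregate counter rather than being
	// re-summed from the per-CPU slice, so it is available even when the
	// per-CPU values are not collected.
	ret.Cpu.Usage.Total = s.CpuStats.CpuUsage.TotalUsage
	ret.Cpu.CFS.Periods = s.CpuStats.ThrottlingData.Periods
	ret.Cpu.CFS.ThrottledPeriods = s.CpuStats.ThrottlingData.ThrottledPeriods
	ret.Cpu.CFS.ThrottledTime = s.CpuStats.ThrottlingData.ThrottledTime

	if !withPerCPU {
		// Per-CPU metrics disabled: ret.Cpu.Usage.PerCpu stays empty.
		return
	}
	if len(s.CpuStats.CpuUsage.PercpuUsage) == 0 {
		// libcontainer's GetStats can leave PercpuUsage nil if it skipped
		// the cpuacct subsystem; nothing per-CPU to copy.
		return
	}
	// (elided context) determine numActual, the number of CPUs actually
	// present, capped by the length of PercpuUsage, then copy:
	//   ret.Cpu.Usage.PerCpu = make([]uint64, numActual)
	//   for i := uint32(0); i < numActual; i++ {
	//       ret.Cpu.Usage.PerCpu[i] = s.CpuStats.CpuUsage.PercpuUsage[i]
	//   }
}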
@@ -587,13 +590,13 @@ func setNetworkStats(libcontainerStats *libcontainer.Stats, ret *info.ContainerS
 	}
 }
 
-func newContainerStats(libcontainerStats *libcontainer.Stats) *info.ContainerStats {
+func newContainerStats(libcontainerStats *libcontainer.Stats, withPerCPU bool) *info.ContainerStats {
 	ret := &info.ContainerStats{
 		Timestamp: time.Now(),
 	}
 
 	if s := libcontainerStats.CgroupStats; s != nil {
-		setCpuStats(s, ret)
+		setCpuStats(s, ret, withPerCPU)
 		setDiskIoStats(s, ret)
 		setMemoryStats(s, ret)
 	}
container/libcontainer/helpers_test.go (4 changes: 2 additions & 2 deletions)
@@ -122,15 +122,15 @@ func TestMorePossibleCPUs(t *testing.T) {
 		},
 	}
 	var ret info.ContainerStats
-	setCpuStats(s, &ret)
+	setCpuStats(s, &ret, true)
 
 	expected := info.ContainerStats{
 		Cpu: info.CpuStats{
 			Usage: info.CpuUsage{
 				PerCpu: perCpuUsage[0:realNumCPUs],
 				User: s.CpuStats.CpuUsage.UsageInUsermode,
 				System: s.CpuStats.CpuUsage.UsageInKernelmode,
-				Total: 8562955455524 * uint64(realNumCPUs),
+				Total: 33802947350272,
 			},
 		},
 	}
metrics/prometheus.go (10 changes: 9 additions & 1 deletion)
@@ -150,10 +150,18 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc) *PrometheusCo
 		},
 	}, {
 		name: "container_cpu_usage_seconds_total",
-		help: "Cumulative cpu time consumed per cpu in seconds.",
+		help: "Cumulative cpu time consumed in seconds.",
 		valueType: prometheus.CounterValue,
 		extraLabels: []string{"cpu"},
 		getValues: func(s *info.ContainerStats) metricValues {
+			if len(s.Cpu.Usage.PerCpu) == 0 {
+				if s.Cpu.Usage.Total > 0 {
+					return metricValues{{
+						value: float64(s.Cpu.Usage.Total) / float64(time.Second),
+						labels: []string{"total"},
+					}}
+				}
+			}
 			values := make(metricValues, 0, len(s.Cpu.Usage.PerCpu))
 			for i, value := range s.Cpu.Usage.PerCpu {
 				if value > 0 {
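The per-CPU branch is truncated above; it continues roughly as follows (a sketch: the cpuNN label format is inferred from the testdata change below, and metricValue is assumed to be the element type of metricValues):

	for i, value := range s.Cpu.Usage.PerCpu {
		if value > 0 {
			// One sample per CPU that has accrued any time, labelled cpu="cpuNN".
			values = append(values, metricValue{
				value: float64(value) / float64(time.Second),
				labels: []string{fmt.Sprintf("cpu%02d", i)},
			})
		}
	}
	return values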
metrics/testdata/prometheus_metrics (2 changes: 1 addition & 1 deletion)
@@ -28,7 +28,7 @@ container_cpu_load_average_10s{container_env_foo_env="prod",container_label_foo_
 # HELP container_cpu_system_seconds_total Cumulative system cpu time consumed in seconds.
 # TYPE container_cpu_system_seconds_total counter
 container_cpu_system_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 7e-09
-# HELP container_cpu_usage_seconds_total Cumulative cpu time consumed per cpu in seconds.
+# HELP container_cpu_usage_seconds_total Cumulative cpu time consumed in seconds.
 # TYPE container_cpu_usage_seconds_total counter
 container_cpu_usage_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",cpu="cpu00",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 2e-09
 container_cpu_usage_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",cpu="cpu01",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 3e-09
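With percpu disabled there are no per-CPU values, so the same metric would instead carry a single sample labelled cpu="total" (illustrative line, not part of this commit's testdata; the value is a placeholder):

	container_cpu_usage_seconds_total{cpu="total",id="testcontainer",image="test",name="testcontaineralias"} <total seconds>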
