Skip to content

Expose OOM event count to prometheus #2829

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 15, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cmd/cadvisor.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ var (
container.CPUTopologyMetrics: struct{}{},
container.ResctrlMetrics: struct{}{},
container.CPUSetMetrics: struct{}{},
container.OOMMetrics: struct{}{},
}
)

Expand Down
1 change: 1 addition & 0 deletions cmd/cadvisor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ func TestToIncludedMetrics(t *testing.T) {
container.CPUTopologyMetrics: struct{}{},
container.ResctrlMetrics: struct{}{},
container.CPUSetMetrics: struct{}{},
container.OOMMetrics: struct{}{},
},
container.AllMetrics,
{},
Expand Down
2 changes: 2 additions & 0 deletions container/factory.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ const (
CPUTopologyMetrics MetricKind = "cpu_topology"
ResctrlMetrics MetricKind = "resctrl"
CPUSetMetrics MetricKind = "cpuset"
OOMMetrics MetricKind = "oom_event"
)

// AllMetrics represents all kinds of metrics that cAdvisor supports.
Expand All @@ -89,6 +90,7 @@ var AllMetrics = MetricSet{
CPUTopologyMetrics: struct{}{},
ResctrlMetrics: struct{}{},
CPUSetMetrics: struct{}{},
OOMMetrics: struct{}{},
}

func (mk MetricKind) String() string {
Expand Down
2 changes: 2 additions & 0 deletions info/v1/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -963,6 +963,8 @@ type ContainerStats struct {
Resctrl ResctrlStats `json:"resctrl,omitempty"`

CpuSet CPUSetStats `json:"cpuset,omitempty"`

OOMEvents uint64 `json:"oom_events,omitempty"`
}

func timeEq(t1, t2 time.Time, tolerance time.Duration) bool {
Expand Down
6 changes: 6 additions & 0 deletions manager/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import (
"strconv"
"strings"
"sync"
"sync/atomic"
"time"

"github.com/google/cadvisor/cache/memory"
Expand Down Expand Up @@ -102,6 +103,8 @@ type containerData struct {

// resctrlCollector updates stats for resctrl controller.
resctrlCollector stats.Collector

oomEvents uint64
}

// jitter returns a time.Duration between duration and duration + maxFactor * duration,
Expand Down Expand Up @@ -668,6 +671,9 @@ func (cd *containerData) updateStats() error {
klog.V(2).Infof("Failed to add summary stats for %q: %v", cd.info.Name, err)
}
}

stats.OOMEvents = atomic.LoadUint64(&cd.oomEvents)

var customStatsErr error
cm := cd.collectorManager.(*collector.GenericCollectorManager)
if len(cm.Collectors) > 0 {
Expand Down
21 changes: 20 additions & 1 deletion manager/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"strconv"
"strings"
"sync"
"sync/atomic"
"time"

"github.com/google/cadvisor/accelerators"
Expand All @@ -35,7 +36,7 @@ import (
"github.com/google/cadvisor/events"
"github.com/google/cadvisor/fs"
info "github.com/google/cadvisor/info/v1"
"github.com/google/cadvisor/info/v2"
v2 "github.com/google/cadvisor/info/v2"
"github.com/google/cadvisor/machine"
"github.com/google/cadvisor/nvm"
"github.com/google/cadvisor/perf"
Expand Down Expand Up @@ -1237,6 +1238,24 @@ func (m *manager) watchForNewOoms() error {
if err != nil {
klog.Errorf("failed to add OOM kill event for %q: %v", oomInstance.ContainerName, err)
}

// Count OOM events for later collection by Prometheus
request := v2.RequestOptions{
IdType: v2.TypeName,
Count: 1,
}
conts, err := m.getRequestedContainers(oomInstance.ContainerName, request)
if err != nil {
klog.V(2).Infof("failed getting container info for %q: %v", oomInstance.ContainerName, err)
continue
}
if len(conts) != 1 {
klog.V(2).Info("Expected the request to match only one container")
continue
}
for _, cont := range conts {
atomic.AddUint64(&cont.oomEvents, 1)
}
}
}()
return nil
Expand Down
11 changes: 11 additions & 0 deletions metrics/prometheus.go
Original file line number Diff line number Diff line change
Expand Up @@ -1757,6 +1757,17 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc, includedMetri
},
}...)
}
if includedMetrics.Has(container.OOMMetrics) {
c.containerMetrics = append(c.containerMetrics, containerMetric{
name: "container_oom_events_total",
help: "Count of out of memory events observed for the container",
valueType: prometheus.CounterValue,
getValues: func(s *info.ContainerStats) metricValues {
return metricValues{{value: float64(s.OOMEvents), timestamp: s.Timestamp}}
},
})
}

return c
}

Expand Down
3 changes: 3 additions & 0 deletions metrics/testdata/prometheus_metrics
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,9 @@ container_network_udp_usage_total{container_env_foo_env="prod",container_label_f
container_network_udp_usage_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",udp_state="listen",zone_name="hello"} 0 1395066363000
container_network_udp_usage_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",udp_state="rxqueued",zone_name="hello"} 0 1395066363000
container_network_udp_usage_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",udp_state="txqueued",zone_name="hello"} 0 1395066363000
# HELP container_oom_events_total Count of out of memory events observed for the container
# TYPE container_oom_events_total counter
container_oom_events_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0 1395066363000
# HELP container_perf_events_total Perf event metric.
# TYPE container_perf_events_total counter
container_perf_events_total{container_env_foo_env="prod",container_label_foo_label="bar",cpu="0",event="instructions",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 123 1395066363000
Expand Down