Skip to content

Commit 2f10b97

Browse files
author
Muhammad Shahzeb
authored
Add alerts to agent mixin used by Grafana cloud integration (#993)
* Add alerts to agent mixin used by Grafana integration
* Fix jsonnet lint
* Update alerts to be used in agent health integration
1 parent 48da183 commit 2f10b97

File tree

4 files changed

+85
-5
lines changed

4 files changed

+85
-5
lines changed
grafana-agent-mixin/alerts/alerts.libsonnet

Lines changed: 78 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,78 @@
1+
{
2+
prometheusAlerts+:: {
3+
groups+: [
4+
{
5+
name: 'GrafanaAgentHealthChecks',
6+
rules: [
7+
{
8+
alert: 'GrafanaAgentDown',
9+
expr: |||
10+
up{
11+
job="integrations/agent",
12+
} == 0
13+
|||,
14+
'for': '5m',
15+
annotations: {
16+
summary: 'Grafana agent is down.',
17+
description: 'Grafana agent is down on {{ $labels.instance }} for the last 5 minutes',
18+
},
19+
labels: {
20+
severity: 'critical',
21+
},
22+
},
23+
{
24+
alert: 'GrafanaAgentUnstable',
25+
expr: |||
26+
avg_over_time(up{
27+
job="integrations/agent",
28+
}[5m]) < 1
29+
|||,
30+
'for': '15m',
31+
labels: {
32+
severity: 'warning',
33+
},
34+
annotations: {
35+
summary: 'Grafana agent is unstable.',
36+
description: 'Grafana agent is unstable or restarting on {{ $labels.instance }} over the last 15 minutes',
37+
},
38+
},
39+
{
40+
alert: 'GrafanaAgentCPUHigh',
41+
expr: |||
42+
(
43+
rate(process_cpu_seconds_total{
44+
job=~"integrations/agent"
45+
}[5m]) > %(alertsCriticalCpuUsage5m)s / 100
46+
)
47+
||| % $._config,
48+
'for': '5m',
49+
labels: {
50+
severity: 'warning',
51+
},
52+
annotations: {
53+
summary: 'Grafana agent high CPU usage.',
54+
description: 'Grafana agent is using more than %(alertsCriticalCpuUsage5m)s percent of CPU on {{ $labels.instance }} for the last 5 minutes' % $._config,
55+
},
56+
},
57+
{
58+
alert: 'GrafanaAgentMemHigh',
59+
expr: |||
60+
(
61+
sum without (instance) (go_memstats_heap_inuse_bytes{job=~"integrations/agent"}) /
62+
sum without (instance, instance_group_name) (agent_wal_storage_active_series{job=~"integrations/agent"}) / 1e3 > %(alertsCriticalMemUsage5m)s
63+
)
64+
||| % $._config,
65+
'for': '5m',
66+
labels: {
67+
severity: 'warning',
68+
},
69+
annotations: {
70+
summary: 'Grafana agent high memory usage.',
71+
description: 'Grafana agent is using more than %(alertsCriticalMemUsage5m)s of memory on {{ $labels.instance }} for the last 5 minutes' % $._config,
72+
},
73+
},
74+
],
75+
},
76+
],
77+
},
78+
}

grafana-agent-mixin/config.libsonnet

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,12 @@
11
{
2-
local makeGroupBy(groups) = std.join(', ', groups),
3-
42
_config+:: {
53
dashboardTags: ['grafana-agent'],
64
dashboardPeriod: 'now-1h',
75
dashboardRefresh: '1m',
86
dashboardTimezone: 'default',
7+
8+
//alert thresholds
9+
alertsCriticalCpuUsage5m: 80, //percent
10+
alertsCriticalMemUsage5m: 100, //kilo bytes per active series
911
},
1012
}

grafana-agent-mixin/dashboards/grafana-agent-debugging.libsonnet

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -100,7 +100,7 @@ local instance_template = grafana.template.new(
100100
'rate(process_cpu_seconds_total{' + host_matcher + '}[$__rate_interval])',
101101
legendFormat='{{instance}}',
102102
)) +
103-
utils.timeSeriesOverride(unit='percent');
103+
utils.timeSeriesOverride(unit='percentunit');
104104

105105
local TCPConnections =
106106
graphPanel.new(

grafana-agent-mixin/mixin.libsonnet

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,3 @@
1-
{ grafanaDashboardFolder: 'Grafana Agent' }
2-
+ (import 'dashboards/dashboards.libsonnet')
1+
(import 'dashboards/dashboards.libsonnet')
2+
+ (import 'alerts/alerts.libsonnet')
33
+ (import 'config.libsonnet')

0 commit comments

Comments
 (0)