You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
Copy file name to clipboard. Expand all lines: alerts/apps_alerts.libsonnet
+50-16
Original file line number
Diff line number
Diff line change
@@ -23,7 +23,9 @@ local utils = import '../lib/utils.libsonnet';
23
23
severity:'warning',
24
24
},
25
25
annotations: {
26
-
description:'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is in waiting state (reason: "CrashLoopBackOff").',
26
+
description:'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is in waiting state (reason: "CrashLoopBackOff")%s.' % [
27
+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
28
+
],
27
29
summary:'Pod is crash looping.',
28
30
},
29
31
'for': '15m',
@@ -47,7 +49,9 @@ local utils = import '../lib/utils.libsonnet';
47
49
severity:'warning',
48
50
},
49
51
annotations: {
50
-
description:'Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes.',
52
+
description:'Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes%s.' % [
53
+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
54
+
],
51
55
summary:'Pod has been in a non-ready state for more than 15 minutes.',
52
56
},
53
57
'for': '15m',
@@ -63,7 +67,9 @@ local utils = import '../lib/utils.libsonnet';
63
67
severity:'warning',
64
68
},
65
69
annotations: {
66
-
description:'Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back.',
70
+
description:'Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back%s.' % [
71
+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
72
+
],
67
73
summary:'Deployment generation mismatch due to possible roll-back',
68
74
},
69
75
'for': '15m',
@@ -85,7 +91,9 @@ local utils = import '../lib/utils.libsonnet';
85
91
severity:'warning',
86
92
},
87
93
annotations: {
88
-
description:'Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes.',
94
+
description:'Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes%s.' % [
95
+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
96
+
],
89
97
summary:'Deployment has not matched the expected number of replicas.',
90
98
},
91
99
'for': '15m',
@@ -100,7 +108,9 @@ local utils = import '../lib/utils.libsonnet';
100
108
severity:'warning',
101
109
},
102
110
annotations: {
103
-
description:'Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment }} is not progressing for longer than 15 minutes.',
111
+
description:'Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment }} is not progressing for longer than 15 minutes%s.' % [
112
+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
113
+
],
104
114
summary:'Deployment rollout is not progressing.',
105
115
},
106
116
'for': '15m',
@@ -122,7 +132,9 @@ local utils = import '../lib/utils.libsonnet';
122
132
severity:'warning',
123
133
},
124
134
annotations: {
125
-
description:'StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.',
135
+
description:'StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes%s.' % [
136
+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
137
+
],
126
138
summary:'StatefulSet has not matched the expected number of replicas.',
127
139
},
128
140
'for': '15m',
@@ -138,7 +150,9 @@ local utils = import '../lib/utils.libsonnet';
138
150
severity:'warning',
139
151
},
140
152
annotations: {
141
-
description:'StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back.',
153
+
description:'StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back%s.' % [
154
+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
155
+
],
142
156
summary:'StatefulSet generation mismatch due to possible roll-back',
143
157
},
144
158
'for': '15m',
@@ -168,7 +182,9 @@ local utils = import '../lib/utils.libsonnet';
168
182
severity:'warning',
169
183
},
170
184
annotations: {
171
-
description:'StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.',
185
+
description:'StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out%s.' % [
186
+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
187
+
],
172
188
summary:'StatefulSet update has not been rolled out.',
173
189
},
174
190
'for': '15m',
@@ -205,7 +221,10 @@ local utils = import '../lib/utils.libsonnet';
205
221
severity:'warning',
206
222
},
207
223
annotations: {
208
-
description:'DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least %(kubeDaemonSetRolloutStuckFor)s.' % $._config,
224
+
description:'DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least %s%s.' % [
225
+
$._config.kubeDaemonSetRolloutStuckFor,
226
+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
227
+
],
209
228
summary:'DaemonSet rollout is stuck.',
210
229
},
211
230
'for': $._config.kubeDaemonSetRolloutStuckFor,
@@ -218,7 +237,9 @@ local utils = import '../lib/utils.libsonnet';
218
237
severity:'warning',
219
238
},
220
239
annotations: {
221
-
description:'pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour. (reason: "{{ $labels.reason }}").',
240
+
description:'pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour. (reason: "{{ $labels.reason }}")%s.' % [
241
+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
242
+
],
222
243
summary:'Pod container waiting longer than 1 hour',
223
244
},
224
245
'for': '1h',
@@ -235,7 +256,9 @@ local utils = import '../lib/utils.libsonnet';
235
256
severity:'warning',
236
257
},
237
258
annotations: {
238
-
description:'{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.',
259
+
description:'{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled%s.' % [
260
+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
261
+
],
239
262
summary:'DaemonSet pods are not scheduled.',
240
263
},
241
264
'for': '10m',
@@ -249,7 +272,9 @@ local utils = import '../lib/utils.libsonnet';
249
272
severity:'warning',
250
273
},
251
274
annotations: {
252
-
description:'{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.',
275
+
description:'{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run%s.' % [
276
+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
277
+
],
253
278
summary:'DaemonSet pods are misscheduled.',
254
279
},
255
280
'for': '15m',
@@ -265,7 +290,10 @@ local utils = import '../lib/utils.libsonnet';
265
290
severity:'warning',
266
291
},
267
292
annotations: {
268
-
description:'Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than {{ "%(kubeJobTimeoutDuration)s" | humanizeDuration }} to complete.' % $._config,
293
+
description:'Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than {{ "%s" | humanizeDuration }} to complete%s.' % [
294
+
$._config.kubeJobTimeoutDuration,
295
+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
296
+
],
269
297
summary:'Job did not complete in time',
270
298
},
271
299
},
@@ -279,7 +307,9 @@ local utils = import '../lib/utils.libsonnet';
279
307
severity:'warning',
280
308
},
281
309
annotations: {
282
-
description:'Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert.',
310
+
description:'Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert%s.' % [
311
+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
312
+
],
283
313
summary:'Job failed to complete.',
284
314
},
285
315
},
@@ -303,7 +333,9 @@ local utils = import '../lib/utils.libsonnet';
303
333
severity:'warning',
304
334
},
305
335
annotations: {
306
-
description:'HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has not matched the desired number of replicas for longer than 15 minutes.',
336
+
description:'HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has not matched the desired number of replicas for longer than 15 minutes%s.' % [
337
+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
338
+
],
307
339
summary:'HPA has not matched desired number of replicas.',
308
340
},
309
341
'for': '15m',
@@ -319,7 +351,9 @@ local utils = import '../lib/utils.libsonnet';
319
351
severity:'warning',
320
352
},
321
353
annotations: {
322
-
description:'HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has been running at max replicas for longer than 15 minutes.',
354
+
description:'HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has been running at max replicas for longer than 15 minutes%s.' % [
355
+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
Copy file name to clipboard. Expand all lines: alerts/kube_apiserver.libsonnet
+9-3
Original file line number
Diff line number
Diff line change
@@ -36,7 +36,9 @@ local utils = import '../lib/utils.libsonnet';
36
36
long:'%(long)s' % w,
37
37
},
38
38
annotations: {
39
-
description:'The API server is burning too much error budget.',
39
+
description:'The API server is burning too much error budget%s.' % [
40
+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
41
+
],
40
42
summary:'The API server is burning too much error budget.',
41
43
},
42
44
'for': '%(for)s' % w,
@@ -111,7 +113,9 @@ local utils = import '../lib/utils.libsonnet';
111
113
severity:'warning',
112
114
},
113
115
annotations: {
114
-
description:'Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.',
116
+
description:'Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}%% available over the last 10m%s.' % [
117
+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
118
+
],
115
119
summary:'Kubernetes aggregated API is down.',
116
120
},
117
121
},
@@ -128,7 +132,9 @@ local utils = import '../lib/utils.libsonnet';
128
132
severity:'warning',
129
133
},
130
134
annotations: {
131
-
description:'The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.',
135
+
description:'The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests%s.' % [
136
+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
137
+
],
132
138
summary:'The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.',
description:'The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes.',
98
+
description:'The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes%s.' % [
99
+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
100
+
],
83
101
summary:'Node readiness status is flapping.',
84
102
},
85
103
},
@@ -93,7 +111,9 @@
93
111
severity:'warning',
94
112
},
95
113
annotations: {
96
-
description:'The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}.',
114
+
description:'The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}%s.' % [
115
+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
116
+
],
97
117
summary:'Kubelet Pod Lifecycle Event Generator is taking too long to relist.',
98
118
},
99
119
},
@@ -107,7 +127,9 @@
107
127
severity:'warning',
108
128
},
109
129
annotations: {
110
-
description:'Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}.',
130
+
description:'Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}%s.' % [
131
+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
132
+
],
111
133
summary:'Kubelet Pod startup latency is too high.',
112
134
},
113
135
},
@@ -120,7 +142,9 @@
120
142
severity:'warning',
121
143
},
122
144
annotations: {
123
-
description:'Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.',
145
+
description:'Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}%s.' % [
146
+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
147
+
],
124
148
summary:'Kubelet client certificate is about to expire.',
125
149
},
126
150
},
@@ -133,7 +157,9 @@
133
157
severity:'critical',
134
158
},
135
159
annotations: {
136
-
description:'Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.',
160
+
description:'Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}%s.' % [
161
+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
162
+
],
137
163
summary:'Kubelet client certificate is about to expire.',
138
164
},
139
165
},
@@ -146,7 +172,9 @@
146
172
severity:'warning',
147
173
},
148
174
annotations: {
149
-
description:'Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.',
175
+
description:'Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}%s.' % [
176
+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
177
+
],
150
178
summary:'Kubelet server certificate is about to expire.',
151
179
},
152
180
},
@@ -159,7 +187,9 @@
159
187
severity:'critical',
160
188
},
161
189
annotations: {
162
-
description:'Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.',
190
+
description:'Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}%s.' % [
191
+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
192
+
],
163
193
summary:'Kubelet server certificate is about to expire.',
164
194
},
165
195
},
@@ -173,7 +203,9 @@
173
203
},
174
204
'for': '15m',
175
205
annotations: {
176
-
description:'Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes).',
206
+
description:'Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes)%s.' % [
207
+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
208
+
],
177
209
summary:'Kubelet has failed to renew its client certificate.',
178
210
},
179
211
},
@@ -187,7 +219,9 @@
187
219
},
188
220
'for': '15m',
189
221
annotations: {
190
-
description:'Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes).',
222
+
description:'Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes)%s.' % [
223
+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
224
+
],
191
225
summary:'Kubelet has failed to renew its server certificate.',
Copy file name to clipboard. Expand all lines: alerts/resource_alerts.libsonnet
+14-4
Original file line number
Diff line number
Diff line change
@@ -1,3 +1,5 @@
1
+
local utils = import'../lib/utils.libsonnet';
2
+
1
3
{
2
4
_config+:: {
3
5
kubeStateMetricsSelector:error'must provide selector for kube-state-metrics',
@@ -189,7 +191,9 @@
189
191
severity:'info',
190
192
},
191
193
annotations: {
192
-
description:'Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.',
194
+
description:'Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota%s.' % [
195
+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
196
+
],
193
197
summary:'Namespace quota is going to be full.',
194
198
},
195
199
},
@@ -206,7 +210,9 @@
206
210
severity:'info',
207
211
},
208
212
annotations: {
209
-
description:'Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.',
213
+
description:'Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota%s.' % [
214
+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
215
+
],
210
216
summary:'Namespace quota is fully used.',
211
217
},
212
218
},
@@ -223,7 +229,9 @@
223
229
severity:'warning',
224
230
},
225
231
annotations: {
226
-
description:'Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.',
232
+
description:'Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota%s.' % [
233
+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
234
+
],
227
235
summary:'Namespace quota has exceeded the limits.',
228
236
},
229
237
},
@@ -240,7 +248,9 @@
240
248
severity:'info',
241
249
},
242
250
annotations: {
243
-
description:'{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.',
251
+
description:'{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}%s.' % [
252
+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
253
+
],
244
254
summary:'Processes experience elevated CPU throttling.',
0 commit comments