Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 2118a9e

Browse files
committed Jan 23, 2025
Merge branch 'master' into KubeCPUOvercommit-SNO
2 parents 3cd8f3c + 9ceec88 commit 2118a9e

6 files changed

+172
-41
lines changed
 

‎alerts/apps_alerts.libsonnet

+50-16
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,9 @@ local utils = import '../lib/utils.libsonnet';
2323
severity: 'warning',
2424
},
2525
annotations: {
26-
description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is in waiting state (reason: "CrashLoopBackOff").',
26+
description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is in waiting state (reason: "CrashLoopBackOff")%s.' % [
27+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
28+
],
2729
summary: 'Pod is crash looping.',
2830
},
2931
'for': '15m',
@@ -47,7 +49,9 @@ local utils = import '../lib/utils.libsonnet';
4749
severity: 'warning',
4850
},
4951
annotations: {
50-
description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes.',
52+
description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes%s.' % [
53+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
54+
],
5155
summary: 'Pod has been in a non-ready state for more than 15 minutes.',
5256
},
5357
'for': '15m',
@@ -63,7 +67,9 @@ local utils = import '../lib/utils.libsonnet';
6367
severity: 'warning',
6468
},
6569
annotations: {
66-
description: 'Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back.',
70+
description: 'Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back%s.' % [
71+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
72+
],
6773
summary: 'Deployment generation mismatch due to possible roll-back',
6874
},
6975
'for': '15m',
@@ -85,7 +91,9 @@ local utils = import '../lib/utils.libsonnet';
8591
severity: 'warning',
8692
},
8793
annotations: {
88-
description: 'Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes.',
94+
description: 'Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes%s.' % [
95+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
96+
],
8997
summary: 'Deployment has not matched the expected number of replicas.',
9098
},
9199
'for': '15m',
@@ -100,7 +108,9 @@ local utils = import '../lib/utils.libsonnet';
100108
severity: 'warning',
101109
},
102110
annotations: {
103-
description: 'Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment }} is not progressing for longer than 15 minutes.',
111+
description: 'Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment }} is not progressing for longer than 15 minutes%s.' % [
112+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
113+
],
104114
summary: 'Deployment rollout is not progressing.',
105115
},
106116
'for': '15m',
@@ -122,7 +132,9 @@ local utils = import '../lib/utils.libsonnet';
122132
severity: 'warning',
123133
},
124134
annotations: {
125-
description: 'StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.',
135+
description: 'StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes%s.' % [
136+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
137+
],
126138
summary: 'StatefulSet has not matched the expected number of replicas.',
127139
},
128140
'for': '15m',
@@ -138,7 +150,9 @@ local utils = import '../lib/utils.libsonnet';
138150
severity: 'warning',
139151
},
140152
annotations: {
141-
description: 'StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back.',
153+
description: 'StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back%s.' % [
154+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
155+
],
142156
summary: 'StatefulSet generation mismatch due to possible roll-back',
143157
},
144158
'for': '15m',
@@ -168,7 +182,9 @@ local utils = import '../lib/utils.libsonnet';
168182
severity: 'warning',
169183
},
170184
annotations: {
171-
description: 'StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.',
185+
description: 'StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out%s.' % [
186+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
187+
],
172188
summary: 'StatefulSet update has not been rolled out.',
173189
},
174190
'for': '15m',
@@ -205,7 +221,10 @@ local utils = import '../lib/utils.libsonnet';
205221
severity: 'warning',
206222
},
207223
annotations: {
208-
description: 'DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least %(kubeDaemonSetRolloutStuckFor)s.' % $._config,
224+
description: 'DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least %s%s.' % [
225+
$._config.kubeDaemonSetRolloutStuckFor,
226+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
227+
],
209228
summary: 'DaemonSet rollout is stuck.',
210229
},
211230
'for': $._config.kubeDaemonSetRolloutStuckFor,
@@ -218,7 +237,9 @@ local utils = import '../lib/utils.libsonnet';
218237
severity: 'warning',
219238
},
220239
annotations: {
221-
description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour. (reason: "{{ $labels.reason }}").',
240+
description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour. (reason: "{{ $labels.reason }}")%s.' % [
241+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
242+
],
222243
summary: 'Pod container waiting longer than 1 hour',
223244
},
224245
'for': '1h',
@@ -235,7 +256,9 @@ local utils = import '../lib/utils.libsonnet';
235256
severity: 'warning',
236257
},
237258
annotations: {
238-
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.',
259+
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled%s.' % [
260+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
261+
],
239262
summary: 'DaemonSet pods are not scheduled.',
240263
},
241264
'for': '10m',
@@ -249,7 +272,9 @@ local utils = import '../lib/utils.libsonnet';
249272
severity: 'warning',
250273
},
251274
annotations: {
252-
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.',
275+
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run%s.' % [
276+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
277+
],
253278
summary: 'DaemonSet pods are misscheduled.',
254279
},
255280
'for': '15m',
@@ -265,7 +290,10 @@ local utils = import '../lib/utils.libsonnet';
265290
severity: 'warning',
266291
},
267292
annotations: {
268-
description: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than {{ "%(kubeJobTimeoutDuration)s" | humanizeDuration }} to complete.' % $._config,
293+
description: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than {{ "%s" | humanizeDuration }} to complete%s.' % [
294+
$._config.kubeJobTimeoutDuration,
295+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
296+
],
269297
summary: 'Job did not complete in time',
270298
},
271299
},
@@ -279,7 +307,9 @@ local utils = import '../lib/utils.libsonnet';
279307
severity: 'warning',
280308
},
281309
annotations: {
282-
description: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert.',
310+
description: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert%s.' % [
311+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
312+
],
283313
summary: 'Job failed to complete.',
284314
},
285315
},
@@ -303,7 +333,9 @@ local utils = import '../lib/utils.libsonnet';
303333
severity: 'warning',
304334
},
305335
annotations: {
306-
description: 'HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has not matched the desired number of replicas for longer than 15 minutes.',
336+
description: 'HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has not matched the desired number of replicas for longer than 15 minutes%s.' % [
337+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
338+
],
307339
summary: 'HPA has not matched desired number of replicas.',
308340
},
309341
'for': '15m',
@@ -319,7 +351,9 @@ local utils = import '../lib/utils.libsonnet';
319351
severity: 'warning',
320352
},
321353
annotations: {
322-
description: 'HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has been running at max replicas for longer than 15 minutes.',
354+
description: 'HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has been running at max replicas for longer than 15 minutes%s.' % [
355+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
356+
],
323357
summary: 'HPA is running at max replicas',
324358
},
325359
'for': '15m',

‎alerts/kube_apiserver.libsonnet

+9-3
Original file line number | Diff line number | Diff line change
@@ -36,7 +36,9 @@ local utils = import '../lib/utils.libsonnet';
3636
long: '%(long)s' % w,
3737
},
3838
annotations: {
39-
description: 'The API server is burning too much error budget.',
39+
description: 'The API server is burning too much error budget%s.' % [
40+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
41+
],
4042
summary: 'The API server is burning too much error budget.',
4143
},
4244
'for': '%(for)s' % w,
@@ -111,7 +113,9 @@ local utils = import '../lib/utils.libsonnet';
111113
severity: 'warning',
112114
},
113115
annotations: {
114-
description: 'Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.',
116+
description: 'Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}%% available over the last 10m%s.' % [
117+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
118+
],
115119
summary: 'Kubernetes aggregated API is down.',
116120
},
117121
},
@@ -128,7 +132,9 @@ local utils = import '../lib/utils.libsonnet';
128132
severity: 'warning',
129133
},
130134
annotations: {
131-
description: 'The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.',
135+
description: 'The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests%s.' % [
136+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
137+
],
132138
summary: 'The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.',
133139
},
134140
'for': '5m',

‎alerts/kubelet.libsonnet

+50-16
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,5 @@
1+
local utils = import '../lib/utils.libsonnet';
2+
13
{
24
_config+:: {
35
kubeStateMetricsSelector: error 'must provide selector for kube-state-metrics',
@@ -20,12 +22,16 @@
2022
{
2123
expr: |||
2224
kube_node_status_condition{%(kubeStateMetricsSelector)s,condition="Ready",status="true"} == 0
25+
and on (%(clusterLabel)s, node)
26+
kube_node_spec_unschedulable{%(kubeStateMetricsSelector)s} == 0
2327
||| % $._config,
2428
labels: {
2529
severity: 'warning',
2630
},
2731
annotations: {
28-
description: '{{ $labels.node }} has been unready for more than 15 minutes.',
32+
description: '{{ $labels.node }} has been unready for more than 15 minutes%s.' % [
33+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
34+
],
2935
summary: 'Node is not ready.',
3036
},
3137
'for': '15m',
@@ -41,7 +47,9 @@
4147
severity: 'warning',
4248
},
4349
annotations: {
44-
description: '{{ $labels.node }} is unreachable and some workloads may be rescheduled.',
50+
description: '{{ $labels.node }} is unreachable and some workloads may be rescheduled%s.' % [
51+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
52+
],
4553
summary: 'Node is unreachable.',
4654
},
4755
'for': '15m',
@@ -52,34 +60,44 @@
5260
// Some node has a capacity of 1 like AWS's Fargate and only exists while a pod is running on it.
5361
// We have to ignore this special node in the KubeletTooManyPods alert.
5462
expr: |||
55-
count by(%(clusterLabel)s, node) (
56-
(kube_pod_status_phase{%(kubeStateMetricsSelector)s,phase="Running"} == 1) * on(instance,pod,namespace,%(clusterLabel)s) group_left(node) topk by(instance,pod,namespace,%(clusterLabel)s) (1, kube_pod_info{%(kubeStateMetricsSelector)s})
63+
count by (%(clusterLabel)s, node) (
64+
(kube_pod_status_phase{%(kubeStateMetricsSelector)s, phase="Running"} == 1)
65+
* on (%(clusterLabel)s, namespace, pod) group_left (node)
66+
group by (%(clusterLabel)s, namespace, pod, node) (
67+
kube_pod_info{%(kubeStateMetricsSelector)s}
68+
)
5769
)
5870
/
59-
max by(%(clusterLabel)s, node) (
60-
kube_node_status_capacity{%(kubeStateMetricsSelector)s,resource="pods"} != 1
71+
max by (%(clusterLabel)s, node) (
72+
kube_node_status_capacity{%(kubeStateMetricsSelector)s, resource="pods"} != 1
6173
) > 0.95
6274
||| % $._config,
6375
'for': '15m',
6476
labels: {
6577
severity: 'info',
6678
},
6779
annotations: {
68-
description: "Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity.",
80+
description: "Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity%s." % [
81+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
82+
],
6983
summary: 'Kubelet is running at capacity.',
7084
},
7185
},
7286
{
7387
alert: 'KubeNodeReadinessFlapping',
7488
expr: |||
7589
sum(changes(kube_node_status_condition{%(kubeStateMetricsSelector)s,status="true",condition="Ready"}[15m])) by (%(clusterLabel)s, node) > 2
90+
and on (%(clusterLabel)s, node)
91+
kube_node_spec_unschedulable{%(kubeStateMetricsSelector)s} == 0
7692
||| % $._config,
7793
'for': '15m',
7894
labels: {
7995
severity: 'warning',
8096
},
8197
annotations: {
82-
description: 'The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes.',
98+
description: 'The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes%s.' % [
99+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
100+
],
83101
summary: 'Node readiness status is flapping.',
84102
},
85103
},
@@ -93,7 +111,9 @@
93111
severity: 'warning',
94112
},
95113
annotations: {
96-
description: 'The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}.',
114+
description: 'The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}%s.' % [
115+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
116+
],
97117
summary: 'Kubelet Pod Lifecycle Event Generator is taking too long to relist.',
98118
},
99119
},
@@ -107,7 +127,9 @@
107127
severity: 'warning',
108128
},
109129
annotations: {
110-
description: 'Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}.',
130+
description: 'Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}%s.' % [
131+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
132+
],
111133
summary: 'Kubelet Pod startup latency is too high.',
112134
},
113135
},
@@ -120,7 +142,9 @@
120142
severity: 'warning',
121143
},
122144
annotations: {
123-
description: 'Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.',
145+
description: 'Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}%s.' % [
146+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
147+
],
124148
summary: 'Kubelet client certificate is about to expire.',
125149
},
126150
},
@@ -133,7 +157,9 @@
133157
severity: 'critical',
134158
},
135159
annotations: {
136-
description: 'Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.',
160+
description: 'Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}%s.' % [
161+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
162+
],
137163
summary: 'Kubelet client certificate is about to expire.',
138164
},
139165
},
@@ -146,7 +172,9 @@
146172
severity: 'warning',
147173
},
148174
annotations: {
149-
description: 'Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.',
175+
description: 'Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}%s.' % [
176+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
177+
],
150178
summary: 'Kubelet server certificate is about to expire.',
151179
},
152180
},
@@ -159,7 +187,9 @@
159187
severity: 'critical',
160188
},
161189
annotations: {
162-
description: 'Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.',
190+
description: 'Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}%s.' % [
191+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
192+
],
163193
summary: 'Kubelet server certificate is about to expire.',
164194
},
165195
},
@@ -173,7 +203,9 @@
173203
},
174204
'for': '15m',
175205
annotations: {
176-
description: 'Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes).',
206+
description: 'Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes)%s.' % [
207+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
208+
],
177209
summary: 'Kubelet has failed to renew its client certificate.',
178210
},
179211
},
@@ -187,7 +219,9 @@
187219
},
188220
'for': '15m',
189221
annotations: {
190-
description: 'Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes).',
222+
description: 'Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes)%s.' % [
223+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
224+
],
191225
summary: 'Kubelet has failed to renew its server certificate.',
192226
},
193227
},

‎alerts/resource_alerts.libsonnet

+14-4
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,5 @@
1+
local utils = import '../lib/utils.libsonnet';
2+
13
{
24
_config+:: {
35
kubeStateMetricsSelector: error 'must provide selector for kube-state-metrics',
@@ -189,7 +191,9 @@
189191
severity: 'info',
190192
},
191193
annotations: {
192-
description: 'Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.',
194+
description: 'Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota%s.' % [
195+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
196+
],
193197
summary: 'Namespace quota is going to be full.',
194198
},
195199
},
@@ -206,7 +210,9 @@
206210
severity: 'info',
207211
},
208212
annotations: {
209-
description: 'Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.',
213+
description: 'Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota%s.' % [
214+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
215+
],
210216
summary: 'Namespace quota is fully used.',
211217
},
212218
},
@@ -223,7 +229,9 @@
223229
severity: 'warning',
224230
},
225231
annotations: {
226-
description: 'Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.',
232+
description: 'Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota%s.' % [
233+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
234+
],
227235
summary: 'Namespace quota has exceeded the limits.',
228236
},
229237
},
@@ -240,7 +248,9 @@
240248
severity: 'info',
241249
},
242250
annotations: {
243-
description: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.',
251+
description: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}%s.' % [
252+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
253+
],
244254
summary: 'Processes experience elevated CPU throttling.',
245255
},
246256
},

‎alerts/system_alerts.libsonnet

+8-2
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,5 @@
1+
local utils = import '../lib/utils.libsonnet';
2+
13
{
24
_config+:: {
35
notKubeDnsCoreDnsSelector: 'job!~"kube-dns|coredns"',
@@ -19,7 +21,9 @@
1921
severity: 'warning',
2022
},
2123
annotations: {
22-
description: 'There are {{ $value }} different semantic versions of Kubernetes components running.',
24+
description: 'There are {{ $value }} different semantic versions of Kubernetes components running%s.' % [
25+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
26+
],
2327
summary: 'Different semantic versions of Kubernetes components running.',
2428
},
2529
},
@@ -39,7 +43,9 @@
3943
severity: 'warning',
4044
},
4145
annotations: {
42-
description: "Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors.'",
46+
description: "Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors%s." % [
47+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
48+
],
4349
summary: 'Kubernetes API server client is experiencing errors.',
4450
},
4551
},

‎tests.yaml

+41
Original file line number | Diff line number | Diff line change
@@ -570,8 +570,49 @@ tests:
570570

571571
- interval: 1m
572572
input_series:
573+
# node=minikube is uncordoned so we expect the alert to fire
573574
- series: 'kube_node_status_condition{condition="Ready",endpoint="https-main",cluster="kubernetes",instance="10.0.2.15:10250",job="kube-state-metrics",namespace="monitoring",node="minikube",pod="kube-state-metrics-b894d84cc-d6htw",service="kube-state-metrics",status="true"}'
575+
values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'
576+
- series: 'kube_node_spec_unschedulable{endpoint="https-main",cluster="kubernetes",instance="10.0.2.15:10250",job="kube-state-metrics",namespace="monitoring",node="minikube",pod="kube-state-metrics-b894d84cc-d6htw",service="kube-state-metrics",status="true"}'
577+
values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'
578+
# node=minikube2 is cordoned so we expect the alert to not fire
579+
- series: 'kube_node_status_condition{condition="Ready",endpoint="https-main",cluster="kubernetes",instance="10.0.2.15:10250",job="kube-state-metrics",namespace="monitoring",node="minikube2",pod="kube-state-metrics-b894d84cc-f5e9f",service="kube-state-metrics"}'
580+
values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'
581+
- series: 'kube_node_spec_unschedulable{endpoint="https-main",cluster="kubernetes",instance="10.0.2.15:10250",job="kube-state-metrics",namespace="monitoring",node="minikube2",pod="kube-state-metrics-b894d84cc-f5e9f",service="kube-state-metrics"}'
582+
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
583+
alert_rule_test:
584+
- eval_time: 18m
585+
alertname: KubeNodeNotReady
586+
exp_alerts:
587+
- exp_labels:
588+
cluster: kubernetes
589+
node: minikube
590+
severity: warning
591+
condition: Ready
592+
endpoint: https-main
593+
instance: 10.0.2.15:10250
594+
job: kube-state-metrics
595+
namespace: monitoring
596+
pod: kube-state-metrics-b894d84cc-d6htw
597+
service: kube-state-metrics
598+
status: "true"
599+
exp_annotations:
600+
summary: "Node is not ready."
601+
description: 'minikube has been unready for more than 15 minutes.'
602+
runbook_url: 'https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready'
603+
604+
- interval: 1m
605+
input_series:
606+
# node=minikube is uncordoned so we expect the alert to fire
607+
- series: 'kube_node_status_condition{condition="Ready",endpoint="https-main",cluster="kubernetes",instance="10.0.2.15:10250",job="kube-state-metrics",namespace="monitoring",node="minikube",pod="kube-state-metrics-b894d84cc-d6htw",service="kube-state-metrics",status="true"}'
608+
values: '1 0 1 0 1 0 0 0 1 0 1 0 0 0 1 0 1 0 0 1'
609+
- series: 'kube_node_spec_unschedulable{endpoint="https-main",cluster="kubernetes",instance="10.0.2.15:10250",job="kube-state-metrics",namespace="monitoring",node="minikube",pod="kube-state-metrics-b894d84cc-d6htw",service="kube-state-metrics",status="true"}'
610+
values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'
611+
# node=minikube2 is cordoned so we expect the alert to not fire
612+
- series: 'kube_node_status_condition{condition="Ready",endpoint="https-main",cluster="kubernetes",instance="10.0.2.15:10250",job="kube-state-metrics",namespace="monitoring",node="minikube2",pod="kube-state-metrics-b894d84cc-f5e9f",service="kube-state-metrics",status="true"}'
574613
values: '1 0 1 0 1 0 0 0 1 0 1 0 0 0 1 0 1 0 0 1'
614+
- series: 'kube_node_spec_unschedulable{endpoint="https-main",cluster="kubernetes",instance="10.0.2.15:10250",job="kube-state-metrics",namespace="monitoring",node="minikube2",pod="kube-state-metrics-b894d84cc-f5e9f",service="kube-state-metrics"}'
615+
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
575616
alert_rule_test:
576617
- eval_time: 18m
577618
alertname: KubeNodeReadinessFlapping

0 commit comments

Comments
 (0)
Please sign in to comment.