I've configured the setup below to generate alerts for pod restart metrics. The group_wait and group_interval values work fine; however,
the repeat_interval does not. I've tried a number of configurations without any success. I'd appreciate any recommendations.
# Prometheus global defaults: how often targets are scraped and how often
# recording/alerting rules are evaluated. Both keys must be nested under
# `global`; at the top level they are not part of the global section.
global:
  scrape_interval: 15s
  evaluation_interval: 15s
alertmanager.yml file
route:
  # Root route. Alerts that do not match any child route below stay at the
  # root node and are dispatched to 'test-email'.
  receiver: test-email
  routes:
    # Alerts labelled severity=pod-restart, grouped per alert name, pod and
    # namespace. The matcher has to be in the SAME list item as the receiver
    # and the timing settings: a separate `- match_re:` item is a different
    # route, which leaves this one without a matcher.
    # NOTE(review): repeat_interval only re-sends while the alert is still
    # firing, and the Alertmanager docs recommend making it a multiple of
    # group_interval (3h is not a multiple of 40m) - confirm the intended
    # cadence.
    - receiver: pod-restart
      match_re:
        severity: pod-restart
      group_by: [alertname, pod, namespace]
      group_wait: 30s
      group_interval: 40m
      repeat_interval: 3h
    # Alerts labelled severity=pod-critical, grouped per alert name, pod and
    # namespace. Same structure as the route above.
    # NOTE(review): 3h is not a multiple of the 35m group_interval either.
    - receiver: pod-critical
      match_re:
        severity: pod-critical
      group_by: [alertname, pod, namespace]
      group_wait: 30s
      group_interval: 35m
      repeat_interval: 3h
# Prometheus alerting rules. Rule groups must be listed under the top-level
# `groups` key.
groups:
  # ---------------------------------------
  # Monitoring for Pod restarts
  - name: Pod Restart
    rules:
      - alert: Pod Restart
        # rate() over 5m multiplied by 300s is the estimated number of
        # restarts inside the 5-minute window (what increase() would return).
        # NOTE(review): the expression drops back to 0 once the restart leaves
        # the 5m window, so the alert resolves within minutes; a repeat
        # notification after repeat_interval is only sent for alerts that are
        # still firing - confirm this is not why repeats never arrive.
        expr: rate(kube_pod_container_status_restarts_total[5m]) * 300 > 0
        for: 2m
        labels:
          # Routed by the Alertmanager child route matching this severity.
          severity: pod-restart
        annotations:
          description: 'The {{$labels.pod}} Pod running in Namespace {{$labels.namespace}} located in Container {{$labels.container}} has restarted in the previous 5 minutes.'
          summary: 'Container {{$labels.container}} in Pod {{$labels.namespace}}/{{$labels.pod}} has restarted in the previous 5 minutes.'
  # ---------------------------------------
  # Monitoring for Pod ErrImagePull Error
  - name: Pod ErrImagePull Error
    rules:
      - alert: Pod ErrImagePull Error
        # Fires when a container has been waiting with an image-pull failure
        # reason for the whole `for` duration.
        # NOTE(review): the annotations say "previous 5 minutes" but the
        # condition is an instant check held for 2m - confirm the wording.
        expr: kube_pod_container_status_waiting_reason{reason=~"ErrImagePull|ImagePullBackOff"} > 0
        for: 2m
        labels:
          # Routed by the Alertmanager child route matching this severity.
          severity: pod-critical
        annotations:
          description: 'The {{$labels.pod}} Pod running in Namespace {{$labels.namespace}} located in Container {{$labels.container}} has failed due to a {{$labels.reason}} error in the previous 5 minutes.'
          summary: 'Container {{$labels.container}} in Pod {{$labels.namespace}}/{{$labels.pod}} has failed due to a {{$labels.reason}} error in the previous 5 minutes.'