Hello, I have problem that alertmanager resolves pagerduty alerts with 3 min delay. Alerts themselves come to PD exactly on time where fired.
as for config I don't use any grouping on alertmanager side. For resolve I set only "send_resolved: true"
# Alertmanager configuration: two PagerDuty receivers (P4 and P3 for
# component1) and a routing tree that selects between them by the
# `priority` and `component` labels.
alertmanagerFiles:
  alertmanager.yml:
    global:
      slack_api_url: <slack_uri>
    receivers:
      - name: pagerduty_p4_component1
        pagerduty_configs:
          # One "*Alert:* <title>" line per alert in the notification.
          - description: |-
              {{ range .Alerts }}
              *Alert:* {{ .Annotations.title }}
              {{ end }}
            # Also notify PagerDuty when the alerts are resolved.
            send_resolved: true
            severity: |-
              {{ range .Alerts }}
              {{ .Labels.severity }}
              {{ end }}
            service_key: <pd_key>
      - name: pagerduty_p3_component1
        pagerduty_configs:
          - description: |-
              {{ range .Alerts }}
              *Alert:* {{ .Annotations.title }}
              {{ end }}
            send_resolved: true
            severity: |-
              {{ range .Alerts }}
              {{ .Labels.severity }}
              {{ end }}
            service_key: <pd_key>
    route:
      # NOTE(review): no receiver named `slack` appears in the receivers
      # list above; confirm it is defined (it may have been trimmed from
      # this excerpt), since routes must reference an existing receiver.
      receiver: slack
      # Updates for a group that has already been notified -- including
      # resolved notifications -- are only flushed once per
      # group_interval. When unset, the default is 5m, which delays
      # resolves by up to that long. Set it explicitly to shorten the
      # resolve latency. Child routes inherit this value.
      group_interval: 1m
      routes:
        - receiver: pagerduty_p4_component1
          match:
            priority: P4
            component: component1
        - receiver: pagerduty_p3_component1
          match:
            priority: P3
            component: component1
# Prometheus alerting rules: 5XX error-rate alerts for component1 at two
# thresholds (1% -> P4/Minor, 2% -> P3/Major). The `priority` and
# `component` labels are what the Alertmanager routes match on.
serverFiles:
  alerts:
    groups:
      - name: component1_ErrorRate_P4_Minor_code_5XX
        rules:
          - alert: component1_ErrorRate_P4_Minor_code_5XX
            # Per-job ratio of 5xx requests to total requests, both
            # measured as the increase over the last 2 minutes; fires
            # when the ratio exceeds 0.01.
            # NOTE(review): the denominator also counts `component2`
            # traffic while the numerator does not -- confirm intended.
            expr: '(sum by (job) (increase(nginx_ingress_controller_request_duration_seconds_count{ingress=~"component1", status=~"5.*"}[2m]))) / (sum by (job) (increase(nginx_ingress_controller_request_duration_seconds_count{ingress=~"component2|component1"}[2m]))) > 0.01'
            labels:
              priority: P4
              severity: Minor
              error_code: 5XX
              component: component1
            annotations:
              title: 'component1 Error rate P4/Minor - Error Code: 5XX'
              summary: 'component1 Error rate with code 5XX is above 1%'
      - name: component1_ErrorRate_P3_Major_code_5XX
        rules:
          - alert: component1_ErrorRate_P3_Major_code_5XX
            # Same ratio as the P4 rule, with a 0.02 threshold.
            expr: '(sum by (job) (increase(nginx_ingress_controller_request_duration_seconds_count{ingress=~"component1", status=~"5.*"}[2m]))) / (sum by (job) (increase(nginx_ingress_controller_request_duration_seconds_count{ingress=~"component2|component1"}[2m]))) > 0.02'
            labels:
              priority: P3
              severity: Major
              error_code: 5XX
              component: component1
            annotations:
              title: 'component1 Error rate P3/Major - Error Code: 5XX'
              summary: 'component1 Error rate with code 5XX is above 2%'