# Alert rules for node_exporter.
# Note: the node_memory_MemAvailable_bytes metric is absent on old kernels
# (you need an additional recording rule to calculate it if it is absent).
groups:
- name: 'Node exporter alerts'
  rules:
  # Exporter availability: average of `up` over 15m below 0.8 means scrapes
  # have been failing for most of the window; this tolerates single scrape
  # blips that the plain `up == 0` variant (kept below for reference) fires on.
  # noAlarmOn appears to be a comma-separated opt-out label added via
  # relabeling ("node" suppresses this alert) — verify against scrape config.
  - alert: 'Node_exporter_service'
    # expr: up{job="node_exporter"} == 0
    expr: avg_over_time(up{job="node_exporter",noAlarmOn!~"(.*,|^)node(,.*|$)"}[15m]) < 0.8
    for: 15m
    labels:
      severity: 'critical'
    annotations:
      title: "Node Exporter seems down"
      description: 'Node exporter does not return any metrics!'
- name: 'Node exporter Hardware alerts'
  rules:
  # MDRAID health: configured member disks minus active disks; a non-zero
  # difference means at least one disk dropped out of the array.
  - alert: 'MDRAID degraded'
    expr: (node_md_disks{job="node_exporter", noAlarmOn!~"(.*,|^)raid(,.*|$)" } - node_md_disks_active{job="node_exporter"}) != 0
    for: 1m
    labels:
      severity: 'warning'
    annotations:
      title: "MDRAID status"
      # The opening parenthesis before node_md_disks was missing in the
      # rendered text; restored so the displayed formula is balanced.
      description: '{{ if ne ($value | humanize) "0" }} Degraded {{ else }} Healthy {{ end }} RAID array {{ $labels.device }} on {{ $labels.instance }}.
        {{ if ne ($value | humanize) "0" }} some disks failed. {{ end }}
        (node_md_disks{job="{{ $labels.job }}",instance="{{ $labels.instance }}"} - node_md_disks_active{job="{{ $labels.job }}",instance="{{ $labels.instance }}"}) = {{$value | humanize}}'
- alert: "Bond degraded"
expr: (node_bonding_active{ noAlarmOn!~"(.*,|^)bond(,.*|$)" } - node_bonding_slaves{job="node_exporter"}) != 0
for: 1m
labels:
severity: 'warning'
annotations:
title: "Bonding status"
description: 'Bond {{ $labels.master }} is {{ if ne ($value | humanize) "0" }} degraded {{ else }} healthy {{ end }} on {{ $labels.instance }}.
node_bonding_active - node_bonding_slaves = {{ $value | humanize }}'
- name: "Node exporter OS alerts"
  rules:
  # load15 divided by CPU count; sustained > 1.5 per core for 30m.
  # $labels.ip / $labels.host are presumably added via relabeling — verify.
  - alert: "CPU Load linux"
    expr: node_load15{job="node_exporter", noAlarmOn!~"(.*,|^)cpu(,.*|$)" } / (count without (cpu, mode) (node_cpu_seconds_total{job="node_exporter", mode="system"})) > 1.5
    for: 30m
    labels:
      severity: 'critical'
    annotations:
      title: "Avg load15m > 1.5*cpu"
      description: 'Avg load15m is more than cpu count in {{ $value | printf "%2.3f" }} times for {{ $labels.ip }} ({{ $labels.host }})'
- alert: "CPU Load linux"
expr: node_load15{job="node_exporter", noAlarmOn!~"(.*,|^)cpu(,.*|$)" } / (count without (cpu, mode) (node_cpu_seconds_total{job="node_exporter", mode="system"})) > 1.2
for: 30m
labels:
severity: 'warning'
annotations:
title: "Avg load15m > 1.2*cpu"
description: 'Avg load15m is more than cpu count in {{ $value | printf "%2.3f" }} times for {{ $labels.ip }} ({{ $labels.host }})'
- alert: "DiskSpaceUsage"
expr: (node_filesystem_free_bytes{job="node_exporter", fstype=~"(ext.|xfs|zfs)", noAlarmOn!~"(.*,|^)space(,.*|$)" } / node_filesystem_size_bytes{job="node_exporter",fstype=~"(ext.|xfs|zfs)"} * 100) < 7
for: 1m
labels:
severity: 'critical'
annotations:
title: "Disk Space Usage"
description: 'On {{ $labels.ip }} device {{ $labels.device }} mounted on {{ $labels.mountpoint }} has free space of {{ $value | printf "%2.3f" }}%'
- alert: "DiskSpaceUsage"
expr: (node_filesystem_free_bytes{job="node_exporter", fstype=~"(ext.|xfs|zfs)", noAlarmOn!~"(.*,|^)space(,.*|$)" } / node_filesystem_size_bytes{job="node_exporter",fstype=~"(ext.|xfs|zfs)"} * 100) < 10
for: 10m
labels:
severity: 'warning'
annotations:
title: "Disk Space Usage"
description: 'On {{ $labels.ip }} device {{ $labels.device }} mounted on {{ $labels.mountpoint }} has free space of {{ $value | printf "%2.3f" }}%'
- alert: "DiskSpaceUsage"
expr: predict_linear(node_filesystem_free_bytes{job="node_exporter", fstype=~"(ext.|xfs|zfs)", noAlarmOn!~"(.*,|^)predict(,.*|$)", noAlarmOn!~"(.*,|^)space(,.*|$)",owner!="platforms"}[1h], 4*3600) < 0
for: 5m
labels:
severity: 'predict'
annotations:
title: "Disk Space Usage on {{ $labels.ip }} predict for next 4 hours"
description: "On {{ $labels.ip }} device {{ $labels.device }} mounted on {{ $labels.mountpoint }} will predict have space of {{ $value | humanize1024 }}bytes in 4 hours."
- alert: "Inode Usage"
expr: ( ( node_filesystem_files_free{job="node_exporter", fstype=~"(ext.|xfs)"} / node_filesystem_files{job="node_exporter", fstype=~"(ext.|xfs)", noAlarmOn!~"(.*,|^)space(,.*|$)"} ) * 100 ) <= 20
for: 15m
labels:
severity: 'warning'
annotations:
title: "Inode usage"
description: 'Inode usage on {{ $labels.ip }} device {{ $labels.device }} mounted as {{ $labels.mountpoint }} is {{ $value | printf "%2.3f" }}% left.
{{ if le $value 20.0 }} Remove unused files, investigate the causes of their appearance. Run "find {{ $labels.mountpoint }} -mount" {{end}} '
- alert: "MemAvailable"
expr: predict_linear(node_memory_MemAvailable_bytes{job="node_exporter", noAlarmOn!~"(.*,|^)memory(,.*|$)",owner!~"(test|^)platforms"}[2h], 1*3600) <= 0
for: 15m
labels:
severity: 'predict'
annotations:
title: "No Memory Available predict for next hour"
description: 'On {{ $labels.ip }} predict to 1Hour No Memory Available in 1 Hour'
- alert: "MemAvailable"
expr: (node_memory_MemAvailable_bytes{job="node_exporter", noAlarmOn!~"(.*,|^)memory(,.*|$)" } / node_memory_MemTotal_bytes{job="node_exporter"} * 100) < 5 and rate(node_vmstat_pgmajfault{job="node_exporter"}[3m]) > 100
for: 5m
labels:
severity: 'critical'
annotations:
title: "Memory Available"
description: "{{ with printf \"node_memory_MemAvailable_bytes{instance='%s',job='%s'}\" .Labels.instance .Labels.job | query }}{{ . | first | value | humanize1024 }}Bytes Available,{{end}}
{{ with printf \"node_memory_MemTotal_bytes{instance='%s',job='%s'}\" .Labels.instance .Labels.job | query }}{{ . | first | value | humanize1024 }}Bytes Total{{end}}
( {{ $value | humanize }}% Available)
{{ with printf \"rate(node_vmstat_pgmajfault{instance='%s',job='%s'}[3m])\" .Labels.instance .Labels.job | query }}, scanrate is {{ . | first | value | printf \"%2.4f\" }} pgmajfaults/sec{{end}}"
- alert: "SwapUsed"
expr: 100 * ( node_memory_SwapTotal_bytes{job="node_exporter", noAlarmOn!~"(.*,|^)memory(,.*|$)"} - node_memory_SwapFree_bytes{} - node_memory_SwapCached_bytes{} ) / node_memory_SwapTotal_bytes{} > 75
for: 5m
labels:
severity: 'warning'
annotations:
title: "Swap Used more than 75%"
description: 'Swap is used for {{ $value | printf "%2.2f" }}%'
- alert: "DiskIO_read_Time"
expr: rate(node_disk_read_time_seconds_total{noAlarmOn!~"(.*,|^)diskwait(,.*|$)"}[10m]) / rate(node_disk_reads_completed_total[10m]) * 1000 > 300
for: 1h
labels:
severity: 'warning'
annotations:
title: "Disk reads wait time is large >300ms"
description: 'On {{ $labels.ip }} device {{ $labels.device }} has disk wait time {{ $value | printf "%1.1f" }}ms.'
- alert: "DiskIO_write_Time"
expr: rate(node_disk_write_time_seconds_total{noAlarmOn!~"(.*,|^)diskwait(,.*|$)"}[10m]) / rate(node_disk_writes_completed_total[10m]) * 1000 > 300
for: 1h
labels:
severity: 'warning'
annotations:
title: "Disk writes wait time is large >300ms"
description: 'On {{ $labels.ip }} device {{ $labels.device }} has disk wait time {{ $value | printf "%1.1f" }}ms.'
- alert: "DiskIO%b"
expr: rate(node_disk_io_time_seconds_total[30m])*100 > 95
for: 10h
labels:
severity: 'warning'
annotations:
title: "Disk %b is large for 10 hours"
description: 'On {{ $labels.ip }} device {{ $labels.device }} has {{ $value | printf "%2.3f" }}%busy time.'
- alert: "MemAvailable"
expr: (node_memory_MemAvailable_bytes{job="node_exporter", noAlarmOn!~"(.*,|^)memory(,.*|$)" } / node_memory_MemTotal_bytes{job="node_exporter"} * 100) < 5
for: 15m
labels:
severity: 'warning'
annotations:
title: "Memory Available"
description: "{{ with printf \"node_memory_MemAvailable_bytes{instance='%s',job='%s'}\" .Labels.instance .Labels.job | query }}{{ . | first | value | humanize1024 }}Bytes Available,{{end}}
{{ with printf \"node_memory_MemTotal_bytes{instance='%s',job='%s'}\" .Labels.instance .Labels.job | query }}{{ . | first | value | humanize1024 }}Bytes Total{{end}}
( {{ $value | humanize }}% Available)
{{ with printf \"rate(node_vmstat_pgmajfault{instance='%s',job='%s'}[3m])\" .Labels.instance .Labels.job | query }}, scanrate is {{ . | first | value | printf \"%2.4f\" }} pgmajfaults/sec{{end}}"
- alert: "ProcessNearFDLimits"
expr: process_open_fds{job="node_exporter"} / process_max_fds{job="node_exporter"} > 0.8
for: 10m
labels:
severity: 'warning'
annotations:
title: "Process File descriptors."
description: "On {{ $labels.ip }} process_open_fds / process_max_fds is about {{ $value | humanize }}%"
- alert: "ProcessNearFDLimits"
expr: process_open_fds{job="node_exporter"} / process_max_fds{job="node_exporter"} > 0.9
for: 10m
labels:
severity: 'critical'
annotations:
title: "Process File descriptors."
description: "On {{ $labels.ip }} process_open_fds / process_max_fds is about {{ $value | humanize }}%"
- alert: "Uptime"
expr: time() - node_boot_time_seconds{job="node_exporter"} < 3600
for: 1m
labels:
severity: warning
annotations:
title: "System rebooted"
description: "Host {{ $labels.instance }} uptime is less than 1 hour. \n VALUE = {{ $value | humanizeDuration }} seconds"
- alert: "NetworkTransmitDrops"
expr: 100 * irate(node_network_transmit_drop_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*'}[30m]) /
irate(node_network_transmit_packets_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*'}[30m]) > 0
for: 1m
labels:
severity: warning
annotations:
title: "Network Transmit Drops"
description: 'Host {{ $labels.instance }} drops {{ $value | printf "%2.6f" }}% of transmit packets on NIC {{ $labels.device }}'
- alert: "NetworkReceiveErrs"
expr: 100 * irate(node_network_receive_errs_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*'}[30m]) /
irate(node_network_receive_packets_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*'}[30m]) > 0
for: 1m
labels:
severity: warning
annotations:
title: "Network Receive Errs"
description: 'Host {{ $labels.instance }} have errors in {{ $value | printf "%2.6f" }}% of received packets on NIC {{ $labels.device }}'
- alert: "NetworkTranmitColls"
expr: 100 * irate(node_network_transmit_colls_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*'}[30m]) /
(irate(node_network_transmit_colls_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*'}[30m]) +
irate(node_network_transmit_packets_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*'}[30m])) > 0
for: 1m
labels:
severity: warning
annotations:
title: "Network Tranmit Collisions"
description: 'Host {{ $labels.instance }} have collisions in {{ $value | printf "%2.6f" }}% of tried to tranmit packets on NIC {{ $labels.device }}'
- alert: "Localtime"
expr: abs(node_time_seconds{job="node_exporter"} - timestamp(node_time_seconds{job="node_exporter"})) > 10
for: 10m
labels:
severity: warning
annotations:
title: "Time is wrong"
description: "Localtime of {{ $labels.instance }} differs from Prometheus time for more than 10 second (value {{ $value | humanize }} sec). Check Time Sync."