# yamllint disable rule:line-length
#####################################################
## MANAGED BY SALT in salt/files/prometheus/alerts ##
#####################################################
---
# Top-level list of Prometheus alerting rule groups.
groups:
# Single rule group; the alert rules that follow are its entries.
- name: ioo-node_health
# NOTE(review): `rules` is expected to be a key of the group entry above,
# with each `- alert:` item nested beneath it - confirm the indentation in
# the deployed file.
rules:
# Informational notice raised as soon as the `up` series of the "nodes" job
# reads 0; `for: 0s` means there is no hold time before the alert fires.
- alert: Node away
  expr: 'up{job="nodes"} == 0'
  for: 0s
  labels:
    severity: info
  annotations:
    title: 'Host {{ $labels.instance }} down'
    description: 'Node exporter on {{ $labels.instance }} became unreachable.'
# Critical alert: same condition as "Node away" (`up` of the "nodes" job is
# 0), but it must hold for one full minute before firing.
- alert: Node down
  expr: 'up{job="nodes"} == 0'
  for: 1m
  labels:
    severity: critical
  annotations:
    title: 'Host {{ $labels.instance }} down'
    description: >-
      Failed to scrape node exporter on {{ $labels.instance }}
      for over 1 minute, host seems to be down.
# small partitions
# Fires when a filesystem smaller than 100000000000 bytes (100 GB) that is
# not mounted read-only has had less than 10% of its space available for one
# minute. The result is joined with node_uname_info on `instance` so the
# alert also carries the `nodename` label.
- alert: Low disk space (small)
  # The read-only filter is applied once, inside the parentheses; repeating it
  # after the group_left join cannot drop any further series because it
  # matches on the same (instance, device, mountpoint) labels.
  expr: >-
    ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10
    and
    ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
    and
    ON (instance, device, mountpoint) node_filesystem_size_bytes < 100000000000
  for: 1m
  labels:
    severity: critical
  annotations:
    title: >-
      Host {{ $labels.instance }} low on disk space
    # $value is the percentage of space still available, not the fill level.
    description: |
      Filesystem mounted at {{ $labels.mountpoint }} on {{ $labels.instance }} is filled over 90%.
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
# big partitions
# Fires when a filesystem of at least 100000000000 bytes (100 GB) that is not
# mounted read-only has had less than 1% of its space available for one
# minute. The result is joined with node_uname_info on `instance` so the
# alert also carries the `nodename` label.
- alert: Low disk space (big)
  # The read-only filter is applied once, inside the parentheses; repeating it
  # after the group_left join cannot drop any further series because it
  # matches on the same (instance, device, mountpoint) labels.
  expr: >-
    ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 1
    and
    ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
    and
    ON (instance, device, mountpoint) node_filesystem_size_bytes >= 100000000000
  for: 1m
  labels:
    severity: critical
  annotations:
    title: >-
      Host {{ $labels.instance }} low on disk space
    # $value is the percentage of space still available, not the fill level.
    description: |
      Filesystem mounted at {{ $labels.mountpoint }} on {{ $labels.instance }} is filled over 99%.
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
# Warning for non-tmpfs, non-read-only filesystems that are already below 10%
# available space and whose 6h linear trend of available bytes reaches below
# zero within 43200 seconds (12 hours). Must hold for two minutes.
# NOTE(review): this rule uses a `summary` annotation where the earlier rules
# in this group use `title` - confirm the notification template reads both.
- alert: Low disk space predicted
  expr: >-
    ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10
    and ON (instance, device, mountpoint)
    predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[6h], 43200) < 0
    and ON (instance, device, mountpoint)
    node_filesystem_readonly == 0)
    * on(instance) group_left (nodename)
    node_uname_info{nodename=~".+"}
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: 'Host {{ $labels.instance }} might run low on disk space'
    description: |
      Based off data gathered in the last six hours, the filesystem mounted at {{ $labels.mountpoint }} on {{ $labels.instance }} is predicted to run out of space within the next twelve hours at the current write rate.
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
# Warning when MemAvailable is below 10% of MemTotal for one minute; the
# join with node_uname_info adds the `nodename` label to the alert.
- alert: Low memory
  expr: >-
    (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10)
    * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
  for: 1m
  labels:
    severity: warning
  annotations:
    summary: 'Host {{ $labels.instance }} running out of memory'
    description: |
      Memory is filling up on {{ $labels.instance }} (< 10% left).
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
# Warning when the summed non-idle CPU rate of an instance stays above 0.8
# for five minutes. The 2m rate is first averaged per (mode, instance) -
# presumably across the per-CPU series - and the modes are then summed.
- alert: High CPU load
  expr: >-
    (sum by (instance) (avg by (mode, instance)
    (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8)
    * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'Host {{ $labels.instance }} high on CPU usage'
    # $value is a fraction (0.8 corresponds to the 80% in the text).
    description: |
      CPU load on {{ $labels.instance }} is > 80%
      VALUE = {{ $value }}
      LABELS = {{ $labels }}