From 332320b2dad8451f42f7e906b2173649f3c77b96 Mon Sep 17 00:00:00 2001
From: Georg Pfuetzenreuter
Date: Mar 24 2024 03:15:06 +0000
Subject: Improve disk space alerts

In an attempt to improve alerting accuracy and to be more lenient with
"low" disk space on machines with large partitions:

- keep the current critical alert at 90% usage (less than 10% free) for
  partitions with a total size of less than 94G (100000000000 bytes)
- add a critical alert at 99% usage (less than 1% free) for partitions
  with a total size of 94G (100000000000 bytes) or more
- adjust the future prediction to warn 12h in advance if 6h worth of
  data suggest that free space might drop below zero

Signed-off-by: Georg Pfuetzenreuter
---

diff --git a/salt/files/prometheus/alerts/base.yml b/salt/files/prometheus/alerts/base.yml
index 71d011b..13b23a4 100644
--- a/salt/files/prometheus/alerts/base.yml
+++ b/salt/files/prometheus/alerts/base.yml
@@ -27,6 +27,7 @@ groups:
           description: >-
             Failed to scrape node exporter on {{ $labels.instance }} for over 1 minute, host seems to be down.

+      # small partitions (total size below 100 GB): alert when less than 10% is free
       - alert: Low disk space
         expr: >-
           ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10
@@ -34,6 +35,8 @@ groups:
           ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
           and
           ON (instance, device, mountpoint) node_filesystem_readonly == 0
+          and
+          ON (instance, device, mountpoint) node_filesystem_size_bytes < 100000000000
         for: 1m
         labels:
           severity: critical
@@ -45,11 +48,32 @@ groups:
             VALUE = {{ $value }}
             LABELS = {{ $labels }}

+      # big partitions (total size of 100 GB or more): alert when less than 1% is free
+      - alert: Low disk space
+        expr: >-
+          ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 1
+          and
+          ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+          and
+          ON (instance, device, mountpoint) node_filesystem_readonly == 0
+          and
+          ON (instance, device, mountpoint) node_filesystem_size_bytes >= 100000000000
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          title: >-
+            Host {{ $labels.instance }} low on disk space
+          description: |
+            Filesystem mounted at {{ $labels.mountpoint }} on {{ $labels.instance }} is filled over 99%.
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
+
       - alert: Low disk space predicted
         expr: >-
           ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10
           and
-          ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0
+          ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[6h], 12 * 3600) < 0
           and
           ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
         for: 2m
@@ -59,6 +83,6 @@ groups:
           summary: >-
             Host {{ $labels.instance }} might run low on disk space
           description: |
-            Filesystem mounted at {{ $labels.mountpoint }} on {{ $labels.instance }} is predicted to run out of space within the next 24 hours at the current write rate
+            Based off data gathered in the last six hours, the filesystem mounted at {{ $labels.mountpoint }} on {{ $labels.instance }} is predicted to run out of space within the next twelve hours at the current write rate.
             VALUE = {{ $value }}
             LABELS = {{ $labels }}