From 332320b2dad8451f42f7e906b2173649f3c77b96 Mon Sep 17 00:00:00 2001
From: Georg Pfuetzenreuter <mail@georg-pfuetzenreuter.net>
Date: Mar 24 2024 03:15:06 +0000
Subject: Improve disk space alerts


In an attempt to improve alerting accuracy and to be more lenient
with "low" disk space on machines with large partitions:

- keep current 90% critical alert for partitions with a total size
  of less than 100 GB (100000000000 bytes, roughly 93 GiB)
- add a 99% critical alert (less than 1% available) for partitions
  with a total size of 100 GB or more
- adjust the "Low disk space predicted" alert to warn 12h in advance
  if the last 6h worth of data suggests free space might drop below zero

Signed-off-by: Georg Pfuetzenreuter <mail@georg-pfuetzenreuter.net>

---

diff --git a/salt/files/prometheus/alerts/base.yml b/salt/files/prometheus/alerts/base.yml
index 71d011b..13b23a4 100644
--- a/salt/files/prometheus/alerts/base.yml
+++ b/salt/files/prometheus/alerts/base.yml
@@ -27,6 +27,7 @@ groups:
           description: >-
             Failed to scrape node exporter on {{ $labels.instance }} for over 1 minute, host seems to be down.
 
+      # small partitions (total size below 100000000000 bytes)
       - alert: Low disk space
         expr: >-
           ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10
@@ -34,6 +35,8 @@ groups:
           ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
           and
           ON (instance, device, mountpoint) node_filesystem_readonly == 0
+          and
+          ON (instance, device, mountpoint) node_filesystem_size_bytes < 100000000000
         for: 1m
         labels:
           severity: critical
@@ -45,11 +48,32 @@ groups:
             VALUE = {{ $value }}
             LABELS = {{ $labels }}
 
+      # big partitions (size >= 100000000000 bytes): critical when under 1% of the space is available
+      - alert: Low disk space
+        expr: >-
+          ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 1
+          and
+          ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+          and
+          ON (instance, device, mountpoint) node_filesystem_readonly == 0
+          and
+          ON (instance, device, mountpoint) node_filesystem_size_bytes >= 100000000000
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          title: >-
+            Host {{ $labels.instance }} low on disk space
+          description: |
+            Filesystem mounted at {{ $labels.mountpoint }} on {{ $labels.instance }} is filled over 99%.
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
+
       - alert: Low disk space predicted
         expr: >-
           ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10
           and
-          ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0
+          ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[6h], 43200) < 0
           and
           ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
         for: 2m
@@ -59,7 +83,7 @@ groups:
           summary: >-
             Host {{ $labels.instance }} might run low on disk space
           description: |
-            Filesystem mounted at {{ $labels.mountpoint }} on {{ $labels.instance }} is predicted to run out of space within the next 24 hours at the current write rate
+            Based off data gathered in the last six hours, the filesystem mounted at {{ $labels.mountpoint }} on {{ $labels.instance }} is predicted to run out of space within the next twelve hours at the current write rate.
             VALUE = {{ $value }}
             LABELS = {{ $labels }}