# Prometheus alerting rules: node health (reachability, disk, memory, CPU).
# yamllint disable rule:line-length
#####################################################
## MANAGED BY SALT in salt/files/prometheus/alerts ##
#####################################################
---
# Alerting rule groups.
groups:
  # Host-level health alerts: exporter reachability, disk space, memory, CPU.
  - name: ioo-node_health
    rules:
      # Informational early notice: raised with no hold time (`for: 0s`) as
      # soon as `up` for a target of the "nodes" job evaluates to 0.
      - alert: Node away
        expr: 'up{job="nodes"} == 0'
        for: 0s
        labels:
          severity: info
        annotations:
          title: 'Host {{ $labels.instance }} down'
          description: 'Node exporter on {{ $labels.instance }} became unreachable.'
      # Critical escalation of the same condition: `up` for a target of the
      # "nodes" job must stay at 0 for a full minute before this fires.
      - alert: Node down
        expr: 'up{job="nodes"} == 0'
        for: 1m
        labels:
          severity: critical
        annotations:
          title: 'Host {{ $labels.instance }} down'
          description: >-
            Failed to scrape node exporter on {{ $labels.instance }} for over
            1 minute, host seems to be down.

      # Small partitions (size below 100000000000 bytes, i.e. 100 GB):
      # alert when less than 10% of the filesystem is still available.
      #
      # Expression, step by step:
      #   1. available percentage < 10
      #   2. restricted to writable filesystems (node_filesystem_readonly == 0)
      #   3. joined with node_uname_info on `instance` so the `nodename` label
      #      is attached to the result (presumably an info metric with value 1,
      #      so $value stays the percentage - verify)
      #   4. restricted to filesystems smaller than 100 GB
      # The read-only filter is applied once, inside the parentheses; repeating
      # it after the join cannot remove any further series.
      - alert: Low disk space (small)
        expr: >-
          ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10
          and
          ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
          and
          ON (instance, device, mountpoint) node_filesystem_size_bytes < 100000000000
        for: 1m
        labels:
          severity: critical
        annotations:
          title: >-
            Host {{ $labels.instance }} low on disk space
          description: |
            Filesystem mounted at {{ $labels.mountpoint }} on {{ $labels.instance }} is filled over 90%.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # Big partitions (size of at least 100000000000 bytes, i.e. 100 GB):
      # alert when less than 1% of the filesystem is still available.
      #
      # Expression, step by step:
      #   1. available percentage < 1
      #   2. restricted to writable filesystems (node_filesystem_readonly == 0)
      #   3. joined with node_uname_info on `instance` so the `nodename` label
      #      is attached to the result (presumably an info metric with value 1,
      #      so $value stays the percentage - verify)
      #   4. restricted to filesystems of 100 GB or more
      # The read-only filter is applied once, inside the parentheses; repeating
      # it after the join cannot remove any further series.
      - alert: Low disk space (big)
        expr: >-
          ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 1
          and
          ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
          and
          ON (instance, device, mountpoint) node_filesystem_size_bytes >= 100000000000
        for: 1m
        labels:
          severity: critical
        annotations:
          title: >-
            Host {{ $labels.instance }} low on disk space
          description: |
            Filesystem mounted at {{ $labels.mountpoint }} on {{ $labels.instance }} is filled over 99%.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # Early warning for writable, non-tmpfs filesystems that already have
      # less than 10% available AND whose available bytes, extrapolated
      # linearly from the last 6h of samples, drop below zero 43200 s (12 h)
      # from now. The final join attaches the `nodename` label.
      # NOTE(review): this rule uses the `summary` annotation while the node
      # and disk-space rules use `title` - confirm which key the notification
      # templates read.
      - alert: Low disk space predicted
        expr: >-
          ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10
          and ON (instance, device, mountpoint)
          predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[6h], 43200) < 0
          and ON (instance, device, mountpoint)
          node_filesystem_readonly == 0)
          * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'Host {{ $labels.instance }} might run low on disk space'
          description: |
            Based off data gathered in the last six hours, the filesystem mounted at {{ $labels.mountpoint }} on {{ $labels.instance }} is predicted to run out of space within the next twelve hours at the current write rate.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # Warn when MemAvailable is below 10% of MemTotal for one minute.
      # The join with node_uname_info attaches the `nodename` label.
      - alert: Low memory
        expr: >-
          (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10)
          * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: 'Host {{ $labels.instance }} running out of memory'
          description: |
            Memory is filling up on {{ $labels.instance }} (< 10% left).
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

      # Warn when the non-idle CPU share stays above 0.8 for five minutes.
      # Per instance: the 2m rate of every non-idle mode is averaged per
      # (mode, instance), then the per-mode averages are summed.
      # The join with node_uname_info attaches the `nodename` label.
      - alert: High CPU load
        expr: >-
          (sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8)
          * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Host {{ $labels.instance }} high on CPU usage'
          description: |
            CPU load on {{ $labels.instance }} is > 80%
            VALUE = {{ $value }}
            LABELS = {{ $labels }}