From 4272dc0cecdf0690373e05f34afc9aa299c8259d Mon Sep 17 00:00:00 2001
From: Christian Boltz <cboltz@opensuse.org>
Date: Apr 14 2024 19:32:26 +0000
Subject: Merge branch 'crameleon/haproxy-alerts' into 'production'


Configure HAProxy alerts

See merge request infra/salt!1590
---

diff --git a/salt/files/prometheus/alerts/haproxy.yml b/salt/files/prometheus/alerts/haproxy.yml
new file mode 100644
index 0000000..dda8010
--- /dev/null
+++ b/salt/files/prometheus/alerts/haproxy.yml
@@ -0,0 +1,241 @@
+# yamllint disable rule:line-length
+#####################################################
+## MANAGED BY SALT in salt/files/prometheus/alerts ##
+#####################################################
+---
+groups:
+  - name: ioo-haproxy
+    rules:
+
+      - alert: HAProxyHighHttp4xxErrorRateBackend  # 4xx share per backend, gated by a minimum 4xx volume
+        expr: >-
+          (
+            (
+              sum by (instance, proxy) (rate(haproxy_server_http_responses_total{code="4xx"}[5m]))
+              /
+              sum by (instance, proxy) (rate(haproxy_server_http_responses_total[5m]))
+              * 100
+            )
+            > 5
+          )
+          and on (instance, proxy) (rate(haproxy_server_http_responses_total{code="4xx"}[5m]) * 60)
+            > 10
+        for: 1m
+        labels:
+          severity: info
+        annotations:
+          description: |
+            Too many HTTP requests with status 4xx (> 5%) for backend {{ $labels.proxy }} on {{ $labels.instance }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
+          summary: HAProxy high HTTP 4xx error rate backend ({{ $labels.instance }}/{{ $labels.proxy }})
+      - alert: HAProxyHighHttp5xxErrorRateBackend  # 5xx share of all responses, per backend
+        expr: >-
+          (
+          sum by (instance, proxy) (rate(haproxy_server_http_responses_total{code="5xx"}[1m]))
+          /
+          sum by (instance, proxy) (rate(haproxy_server_http_responses_total[1m]))
+          * 100
+          )
+          > 5
+        for: 1m
+        labels:
+          # should be warning, but there are existing problematic backends :-(
+          severity: info
+        annotations:
+          description: |
+            Too many HTTP requests with status 5xx (> 5%) for backend {{ $labels.proxy }} on {{ $labels.instance }}
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
+          summary: HAProxy high HTTP 5xx error rate backend ({{ $labels.instance }}/{{ $labels.proxy }})
+
+      # ErrorRateServer rules are only engaged for backends with more than one server to avoid duplicating the above
+      - alert: HAProxyHighHttp4xxErrorRateServer  # 4xx share per server; annotations name the backend too
+        annotations:
+          description: |
+            Too many HTTP requests with status 4xx (> 5%) for server {{ $labels.proxy }}/{{ $labels.server }} on {{ $labels.instance }}.
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
+          summary: HAProxy high HTTP 4xx error rate server ({{ $labels.instance }}/{{ $labels.proxy }}/{{ $labels.server }})
+        expr: >-
+          (
+            (
+              sum by (instance, proxy, server) (rate(haproxy_server_http_responses_total{code="4xx"}[5m]))
+              /
+              sum by (instance, proxy, server) (rate(haproxy_server_http_responses_total[5m]))
+              * 100
+            )
+            > 5
+          )
+          and on (instance, proxy, server) (rate(haproxy_server_http_responses_total{code="4xx"}[5m]) * 60)
+            > 10
+          and on(instance, proxy) haproxy_backend_active_servers
+            > 1
+        for: 1m
+        labels:
+          severity: info
+      - alert: HAProxyHighHttp5xxErrorRateServer  # 5xx share per server; annotations name the backend too
+        annotations:
+          description: |
+            Too many HTTP requests with status 5xx (> 5%) for server {{ $labels.proxy }}/{{ $labels.server }} on {{ $labels.instance }}.
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
+          summary: HAProxy high HTTP 5xx error rate server ({{ $labels.instance }}/{{ $labels.proxy }}/{{ $labels.server }})
+        expr: >-
+          (
+          sum by (instance, proxy, server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m]) * 100)
+          /
+          sum by (instance, proxy, server) (rate(haproxy_server_http_responses_total[1m]))
+          and on(instance, proxy) haproxy_backend_active_servers > 1
+          )
+          > 5
+        for: 1m
+        labels:
+          # should be warning, but there are existing problematic backends :-(
+          severity: info
+
+      - alert: HAProxyServerResponseErrors  # keyed by (instance, proxy, server) so same-named servers in different backends stay separate
+        annotations:
+          description: |
+            Too many response errors to server {{ $labels.proxy }}/{{ $labels.server }} on {{ $labels.instance }} (> 5%).
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
+          summary: HAProxy server response errors ({{ $labels.instance }}/{{ $labels.proxy }}/{{ $labels.server }})
+        expr: >-
+          sum by (instance, proxy, server) (rate(haproxy_server_response_errors_total[1m]) * 100)
+          /
+          sum by (instance, proxy, server) (rate(haproxy_server_http_responses_total[1m]))
+          > 5
+        for: 1m
+        labels:
+          severity: warning
+
+      # filter out backends with no servers because ones serving only errorfiles (for example conncheck) seem to report bogus connection errors
+      - alert: HAProxyBackendConnectionErrors  # only evaluated for backends reporting at least one active server
+        expr: >-
+          (
+          sum by (instance, proxy) (rate(haproxy_backend_connection_errors_total[1m]))
+          and on(instance, proxy) haproxy_backend_active_servers > 0
+          )
+          > 100
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          description: |
+            Too many connection errors to backend {{ $labels.proxy }} from {{ $labels.instance }} (> 100 req/s). Request throughput may be too high.
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
+          summary: HAProxy backend connection errors ({{ $labels.instance }}/{{ $labels.proxy }})
+
+      # ServerConnectionErrors rule is only engaged for backends with more than one server to avoid duplicating the above
+      - alert: HAProxyServerConnectionErrors  # only evaluated for backends reporting more than one active server
+        expr: >-
+          (
+          sum by (instance, proxy, server) (rate(haproxy_server_connection_errors_total[1m]))
+          and on(instance, proxy) haproxy_backend_active_servers > 1
+          )
+          > 100
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          description: |
+            Too many connection errors to server {{ $labels.server }} from {{ $labels.instance }} (> 100 req/s). Request throughput may be too high.
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
+          summary: HAProxy server connection errors ({{ $labels.instance }}/{{ $labels.proxy }}/{{ $labels.server }})
+
+      - alert: HAProxyBackendMaxActiveSession  # aggregated per (instance, proxy) so the annotations can name the instance
+        annotations:
+          description: |
+            HAProxy backend {{ $labels.proxy }} on {{ $labels.instance }} is reaching the session limit (> 80%).
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
+          summary: HAProxy backend max active session ({{ $labels.instance }}/{{ $labels.proxy }})
+        expr: >-
+          (
+          sum by (instance, proxy) (avg_over_time(haproxy_backend_max_sessions[2m]) * 100)
+          /
+          sum by (instance, proxy) (avg_over_time(haproxy_backend_limit_sessions[2m]))
+          )
+          > 80
+        for: 2m
+        labels:
+          severity: warning
+
+      - alert: HAProxyPendingRequests  # aggregated per (instance, proxy) so the annotations can name the instance
+        annotations:
+          description: |
+            Some proxy requests are pending on backend {{ $labels.proxy }} on {{ $labels.instance }}.
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
+          summary: HAProxy pending requests ({{ $labels.instance }}/{{ $labels.proxy }})
+        expr: >-
+          sum by (instance, proxy) (haproxy_backend_current_queue)
+          > 0
+        for: 2m
+        labels:
+          severity: warning
+
+      # I do not fully understand this metric, dividing by 1024 feels wrong, but the HELP text says:
+      # "haproxy_backend_total_time_average_seconds Avg. total time for last 1024 successful connections"
+      # https://github.com/haproxy/haproxy/issues/2317
+      - alert: HAProxyBackendSlowingDown  # averaged per (instance, proxy) so the annotations can name the instance
+        annotations:
+          description: |
+            Average request time is increasing for backend {{ $labels.proxy }} on {{ $labels.instance }}.
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
+          summary: HAProxy backend slowing down ({{ $labels.instance }}/{{ $labels.proxy }})
+        expr: >-
+          avg by (instance, proxy)
+          (
+          haproxy_backend_total_time_average_seconds / 1024
+          )
+          > 1
+        for: 1m  # NOTE(review): the "/ 1024" divisor in expr is unverified and left unchanged - TODO confirm
+        labels:
+          severity: warning
+
+      - alert: HAProxyRetryHigh  # aggregated per (instance, proxy) so the annotations can name the instance
+        annotations:
+          description: |
+            High rate of retry on backend {{ $labels.proxy }} on {{ $labels.instance }}.
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
+          summary: HAProxy retry high ({{ $labels.instance }}/{{ $labels.proxy }})
+        expr: >-
+          sum by (instance, proxy) (rate(haproxy_backend_retry_warnings_total[1m]))
+          > 3
+        for: 2m
+        labels:
+          severity: warning
+
+      - alert: HAProxyBackendServerDown  # matches series carrying state="DOWN" with a value above zero
+        expr: >-
+          haproxy_server_status{state="DOWN"}
+          > 0
+        for: 30s
+        labels:
+          severity: warning
+        annotations:
+          description: |
+            HAProxy server {{ $labels.server }} in backend {{ $labels.proxy }} on {{ $labels.instance }} is down.
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
+          summary: HAProxy server down ({{ $labels.instance }}/{{ $labels.proxy }}/{{ $labels.server }})
+
+      - alert: HAProxyFailedHealthChecks  # any growth of the check-failure counter within the 1m window
+        expr: >-
+          increase(haproxy_server_check_failures_total[1m])
+          > 0
+        for: 1m
+        labels:
+          severity: warning
+        annotations:
+          description: |
+            HAProxy health checks for server {{ $labels.server }} in backend {{ $labels.proxy }} on {{ $labels.instance }} are failing.
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
+          summary: HAProxy failed health checks ({{ $labels.instance }}/{{ $labels.proxy }}/{{ $labels.server }})