# yamllint disable rule:line-length
#####################################################
## MANAGED BY SALT in salt/files/prometheus/alerts ##
#####################################################
---
groups:
- name: ioo-haproxy
rules:
  - alert: HAProxyHighHttp4xxErrorRateBackend
    annotations:
      description: |
        Too many HTTP requests with status 4xx (> 5%) for backend {{ $labels.proxy }} on {{ $labels.instance }}
        VALUE = {{ $value }}
        LABELS = {{ $labels }}
      summary: HAProxy high HTTP 4xx error rate backend ({{ $labels.instance }}/{{ $labels.proxy }})
    # Fires only when BOTH conditions hold:
    #   * relative: more than 5% of the backend's responses are 4xx (5m window)
    #   * absolute: more than 10 4xx responses per minute (rate * 60 > 10)
    # The absolute guard keeps low-traffic backends from flapping on a handful of 4xxs.
    # NOTE(review): the right-hand side of the `and` is a per-server rate (not
    # summed per backend), so the 10/min guard is checked against individual
    # servers while the ratio is backend-wide — confirm this is intended.
    expr: >-
      (
        (
          sum by (instance, proxy) (rate(haproxy_server_http_responses_total{code="4xx"}[5m]))
          /
          sum by (instance, proxy) (rate(haproxy_server_http_responses_total[5m]))
          * 100
        )
        > 5
      )
      and on (instance, proxy) (rate(haproxy_server_http_responses_total{code="4xx"}[5m]) * 60)
      > 10
    for: 1m
    labels:
      severity: info
  - alert: HAProxyHighHttp5xxErrorRateBackend
    annotations:
      description: |
        Too many HTTP requests with status 5xx (> 5%) for backend {{ $labels.proxy }} on {{ $labels.instance }}
        VALUE = {{ $value }}
        LABELS = {{ $labels }}
      summary: HAProxy high HTTP 5xx error rate backend ({{ $labels.instance }}/{{ $labels.proxy }})
    # More than 5% of the backend's responses were 5xx over the last minute.
    # Unlike the 4xx rule above there is no absolute-rate guard (and the
    # window is 1m rather than 5m): any 5xx ratio above the threshold counts.
    expr: >-
      (
        sum by (instance, proxy) (rate(haproxy_server_http_responses_total{code="5xx"}[1m]))
        /
        sum by (instance, proxy) (rate(haproxy_server_http_responses_total[1m]))
        * 100
      )
      > 5
    for: 1m
    labels:
      # should be warning, but there are existing problematic backends :-(
      severity: info
  # ErrorRateServer rules are only engaged for backends with more than one server to avoid duplicating the above
  - alert: HAProxyHighHttp4xxErrorRateServer
    annotations:
      description: |
        Too many HTTP requests with status 4xx (> 5%) for server {{ $labels.server }} on {{ $labels.instance }}.
        VALUE = {{ $value }}
        LABELS = {{ $labels }}
      summary: HAProxy high HTTP 4xx error rate server ({{ $labels.instance }}/{{ $labels.server }})
    # Per-server variant of HAProxyHighHttp4xxErrorRateBackend. Fires when all hold:
    #   * relative: more than 5% of this server's responses are 4xx (5m window)
    #   * absolute: more than 10 4xx responses per minute (rate * 60 > 10)
    #   * the backend has more than one active server (see comment above)
    expr: >-
      (
        (
          sum by (instance, proxy, server) (rate(haproxy_server_http_responses_total{code="4xx"}[5m]))
          /
          sum by (instance, proxy, server) (rate(haproxy_server_http_responses_total[5m]))
          * 100
        )
        > 5
      )
      and on (instance, proxy, server) (rate(haproxy_server_http_responses_total{code="4xx"}[5m]) * 60)
      > 10
      and on(instance, proxy) haproxy_backend_active_servers
      > 1
    for: 1m
    labels:
      severity: info
- alert: HAProxyHighHttp5xxErrorRateServer
annotations:
description: |
Too many HTTP requests with status 5xx (> 5%) for server {{ $labels.server }} on {{ $labels.instance }}.
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: HAProxy high HTTP 5xx error rate server ({{ $labels.instance }}/{{ $labels.server }})
expr: >-
(
sum by (instance, proxy, server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m]) * 100)
/
sum by (instance, proxy, server) (rate(haproxy_server_http_responses_total[1m]))
and on(instance, proxy) haproxy_backend_active_servers > 1
)
> 5
for: 1m
labels:
# should be warning, but there are existing problematic backends :-(
severity: info
- alert: HAProxyServerResponseErrors
annotations:
description: |
Too many response errors to server {{ $labels.server }} on {{ $labels.instance }} (> 5%).
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: HAProxy server response errors ({{ $labels.instance }}/{{ $labels.server }})
expr: >-
sum by (instance, server) (rate(haproxy_server_response_errors_total[1m]) * 100)
/
sum by (instance, server) (rate(haproxy_server_http_responses_total[1m]))
> 5
for: 1m
labels:
severity: warning
  # filter out backends with no servers because ones serving only errorfiles (for example conncheck) seem to report bogus connection errors
  - alert: HAProxyBackendConnectionErrors
    annotations:
      description: |
        Too many connection errors to backend {{ $labels.proxy }} from {{ $labels.instance }} (> 100 req/s). Request throughput may be too high.
        VALUE = {{ $value }}
        LABELS = {{ $labels }}
      summary: HAProxy backend connection errors ({{ $labels.instance }}/{{ $labels.proxy }})
    # More than 100 connection errors per second towards this backend.
    # The `and on(...) haproxy_backend_active_servers > 0` clause drops
    # server-less backends (see the comment above this rule).
    expr: >-
      (
        sum by (instance, proxy) (rate(haproxy_backend_connection_errors_total[1m]))
        and on(instance, proxy) haproxy_backend_active_servers > 0
      )
      > 100
    for: 1m
    labels:
      severity: critical
  # ServerConnectionErrors rule is only engaged for backends with more than one server to avoid duplicating the above
  - alert: HAProxyServerConnectionErrors
    annotations:
      description: |
        Too many connection errors to server {{ $labels.server }} from {{ $labels.instance }} (> 100 req/s). Request throughput may be too high.
        VALUE = {{ $value }}
        LABELS = {{ $labels }}
      summary: HAProxy server connection errors ({{ $labels.instance }}/{{ $labels.proxy }}/{{ $labels.server }})
    # More than 100 connection errors per second towards this single server,
    # only in backends with more than one active server (see comment above).
    # `for: 0m` makes this fire immediately, unlike the backend-level rule.
    expr: >-
      (
        sum by (instance, proxy, server) (rate(haproxy_server_connection_errors_total[1m]))
        and on(instance, proxy) haproxy_backend_active_servers > 1
      )
      > 100
    for: 0m
    labels:
      severity: critical
- alert: HAProxyBackendMaxActiveSession
annotations:
description: |
HAProxy backend {{ $labels.proxy }} on {{ $labels.instance }} is reaching the session limit (> 80%).
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: HAProxy backend max active session ({{ $labels.instance }}/{{ $labels.proxy }})
expr: >-
(
sum by (proxy) (avg_over_time(haproxy_backend_max_sessions[2m]) * 100)
/
sum by (proxy) (avg_over_time(haproxy_backend_limit_sessions[2m]))
)
> 80
for: 2m
labels:
severity: warning
- alert: HAProxyPendingRequests
annotations:
description: |
Some proxy requests are pending on backend {{ $labels.proxy }} on {{ $labels.instance }}.
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: HAProxy pending requests ({{ $labels.instance }}/{{ $labels.proxy }})
expr: >-
sum by (proxy) (haproxy_backend_current_queue)
> 0
for: 2m
labels:
severity: warning
# I do not fully understand this metric, dividing by 1024 feels wrong, but the HELP text says:
# "haproxy_backend_total_time_average_seconds Avg. total time for last 1024 successful connections"
# https://github.com/haproxy/haproxy/issues/2317
- alert: HAProxyBackendSlowingDown
annotations:
description: |
Average request time is increasing for backend {{ $labels.proxy }} on {{ $labels.instance }}.
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: HAProxy backend slowing down ({{ $labels.instance }}/{{ $labels.proxy }})
expr: >-
avg by (proxy)
(
haproxy_backend_total_time_average_seconds / 1024
)
> 1
for: 1m
labels:
severity: warning
- alert: HAProxyRetryHigh
annotations:
description: |
High rate of retry on backend {{ $labels.proxy }} on {{ $labels.instance }}.
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: HAProxy retry high ({{ $labels.instance }}/{{ $labels.proxy }})
expr: >-
sum by (proxy) (rate(haproxy_backend_retry_warnings_total[1m]))
> 3
for: 2m
labels:
severity: warning
  - alert: HAProxyBackendServerDown
    annotations:
      description: |
        HAProxy server {{ $labels.server }} in backend {{ $labels.proxy }} on {{ $labels.instance }} is down.
        VALUE = {{ $value }}
        LABELS = {{ $labels }}
      summary: HAProxy server down ({{ $labels.instance }}/{{ $labels.proxy }}/{{ $labels.server }})
    # NOTE(review): assumes the exporter publishes per-state series for
    # haproxy_server_status and that the state="DOWN" series is non-zero
    # while the server is down — confirm against the HAProxy version in use.
    expr: >-
      haproxy_server_status{state="DOWN"}
      > 0
    for: 30s
    labels:
      severity: warning
  - alert: HAProxyFailedHealthChecks
    annotations:
      description: |
        HAProxy health checks for server {{ $labels.server }} in backend {{ $labels.proxy }} on {{ $labels.instance }} are failing.
        VALUE = {{ $value }}
        LABELS = {{ $labels }}
      summary: HAProxy failed health checks ({{ $labels.instance }}/{{ $labels.proxy }}/{{ $labels.server }})
    # Any health-check failure within the last minute; the `for: 1m` means
    # failures must keep occurring for a full minute before the alert fires.
    expr: >-
      increase(haproxy_server_check_failures_total[1m])
      > 0
    for: 1m
    labels:
      severity: warning