Files
infrastructure/monitoring/event-monitoring/prometheus-rules.yaml
2026-02-09 21:51:46 -08:00

51 lines
1.5 KiB
YAML

# Event Bus Monitoring Rules
#
# PrometheusRule (Prometheus Operator CRD) that defines the alerting rules for
# the NATS deployment in the `event-bus` namespace. All rules live in a single
# group, `nats.rules`, which is evaluated every 30 seconds.
#
# NOTE(review): the metric names used below (nats_connz_*, nats_varz_*,
# nats_jetstream_varz_*) depend on how the NATS exporter is configured
# (metric prefix, enabled endpoints) -- confirm they match what is scraped.
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: event-bus-alerts
  namespace: event-bus
  labels:
    # NOTE(review): presumably matched by the Prometheus `ruleSelector`;
    # verify against the Prometheus custom resource.
    app: nats
spec:
  groups:
    - name: nats.rules
      # Evaluation interval for every rule in this group.
      interval: 30s
      rules:
        # Warns when any connection series whose `state` label is not "OPEN"
        # has stayed above zero for 5 minutes.
        - alert: NATSConnectionFailure
          expr: nats_connz_connections{state!="OPEN"} > 0
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "NATS connection failure detected"
            description: "NATS has {{ $value }} non-open connections"

        # Warns when the per-second inbound message rate, averaged over a
        # 5-minute window, has exceeded 10000 for 10 minutes.
        - alert: NATSHighMessageRate
          expr: rate(nats_varz_in_msgs[5m]) > 10000
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "High NATS message rate"
            description: "NATS is processing {{ $value }} messages per second"

        # Critical when used JetStream storage has exceeded 90% of the
        # configured maximum for 5 minutes.
        # NOTE(review): if the max-bytes series can report 0, the ratio
        # becomes +Inf/NaN; consider guarding the denominator -- confirm.
        - alert: NATSJetStreamStorageFull
          expr: (nats_jetstream_varz_store_bytes / nats_jetstream_varz_store_max_bytes) > 0.9
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "NATS JetStream storage nearly full"
            description: "JetStream storage is {{ $value | humanizePercentage }} full"

        # Critical when a scrape target of job "nats" has reported `up == 0`
        # for 1 minute. Does not fire when the `up` series is absent entirely
        # (for example, when no targets are discovered).
        - alert: NATSPodDown
          expr: up{job="nats"} == 0
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "NATS pod is down"
            description: "NATS pod has been down for more than 1 minute"