---
# Event Bus Monitoring Rules
#
# PrometheusRule (Prometheus Operator CRD) defining the alerting rules for
# the NATS-based event bus. All rules live in a single group, `nats.rules`,
# which is evaluated every 30 seconds.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: event-bus-alerts
  namespace: event-bus
  labels:
    # NOTE(review): the Prometheus Operator only loads rules whose labels
    # match the Prometheus resource's ruleSelector -- confirm `app: nats`
    # is what the selector in this cluster expects.
    app: nats
spec:
  groups:
    - name: nats.rules
      interval: 30s
      rules:
        # Fires when any nats_connz_connections series whose `state` label
        # is not "OPEN" stays above zero for 5 minutes.
        - alert: NATSConnectionFailure
          expr: nats_connz_connections{state!="OPEN"} > 0
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "NATS connection failure detected"
            description: "NATS has {{ $value }} non-open connections"

        # Fires when the 5-minute per-second rate of nats_varz_in_msgs stays
        # above 10000 for 10 minutes.
        - alert: NATSHighMessageRate
          expr: rate(nats_varz_in_msgs[5m]) > 10000
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "High NATS message rate"
            description: "NATS is processing {{ $value }} messages per second"

        # Fires when the ratio of used to maximum JetStream store bytes stays
        # above 0.9 for 5 minutes. The ratio is rendered as a percentage in
        # the description via humanizePercentage.
        - alert: NATSJetStreamStorageFull
          expr: (nats_jetstream_varz_store_bytes / nats_jetstream_varz_store_max_bytes) > 0.9
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "NATS JetStream storage nearly full"
            description: "JetStream storage is {{ $value | humanizePercentage }} full"

        # Fires when a scrape target of job "nats" has reported up == 0 for
        # 1 minute.
        # NOTE(review): `up == 0` yields no series when the target vanishes
        # from service discovery altogether, so this rule stays silent in
        # that case -- consider `or absent(up{job="nats"})` if that scenario
        # must page.
        - alert: NATSPodDown
          expr: up{job="nats"} == 0
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "NATS pod is down"
            description: "NATS pod has been down for more than 1 minute"