---
# Prometheus alerting rules.
# Every group is evaluated every 30s; each alert must hold for its `for:`
# duration before it fires.
groups:
  # Availability, error-rate and latency alerts for HTTP services.
  - name: service_health
    interval: 30s
    rules:
      # Fires when a scrape target whose job name ends in "-service" reports
      # up == 0 for 5 minutes.
      - alert: ServiceDown
        expr: up{job=~".*-service"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Service {{ $labels.job }} is down"
          description: "Service {{ $labels.job }} has been down for more than 5 minutes"

      # Fires when any single series of 5xx responses exceeds 0.05 requests
      # per second. This is an absolute per-series rate, not a fraction of
      # total traffic.
      # NOTE(review): if a 5% error *ratio* was intended, divide by the total
      # request rate instead — confirm with the service owners.
      - alert: HighErrorRate
        expr: rate(http_requests_total{status=~"5.."}[5m]) > 0.05
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate in {{ $labels.job }}"
          description: "Error rate is {{ $value }} errors per second"

      # Fires when the 95th-percentile request duration, computed from the
      # histogram buckets over 5 minutes, stays above 2 for 10 minutes.
      - alert: HighResponseTime
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High response time in {{ $labels.job }}"
          description: "95th percentile response time is {{ $value }} seconds"

  # Container CPU, memory and restart alerts.
  - name: resource_usage
    interval: 30s
    rules:
      # Fires when a container consumes more than 0.8 CPU-seconds per second
      # (averaged over 5 minutes) for 10 minutes.
      - alert: HighCPUUsage
        expr: rate(container_cpu_usage_seconds_total[5m]) > 0.8
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage in {{ $labels.pod }}"
          description: "CPU usage is {{ $value }}"

      # Fires when memory usage exceeds 90% of the configured limit for
      # 10 minutes.
      # The `> 0` filter drops series whose limit is 0 (no limit configured);
      # without it the division yields +Inf and the alert fires spuriously.
      # humanizePercentage renders the 0-1 ratio as a percentage with "%".
      - alert: HighMemoryUsage
        expr: container_memory_usage_bytes / (container_spec_memory_limit_bytes > 0) > 0.9
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage in {{ $labels.pod }}"
          description: "Memory usage is {{ $value | humanizePercentage }}"

      # Fires when a container's restart counter has increased within the
      # last 15 minutes, sustained for 5 minutes.
      # increase() is used instead of rate() so that $value is the number of
      # restarts in the window, which is what the description reports; the
      # firing condition (> 0) is the same for both functions.
      - alert: PodCrashLooping
        expr: increase(kube_pod_container_status_restarts_total[15m]) > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Pod {{ $labels.pod }} is crash looping"
          description: "Pod has restarted {{ $value }} times in the last 15 minutes"

  # PostgreSQL connection and query-latency alerts.
  - name: database
    interval: 30s
    rules:
      # Fires when more than 80% of the maximum connections are in use for
      # 5 minutes.
      # NOTE(review): the division only produces results when both metrics
      # carry identical label sets; verify the metric names and labels against
      # the exporter in use.
      - alert: DatabaseConnectionHigh
        expr: pg_stat_database_numbackends / pg_stat_database_max_connections > 0.8
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High database connection usage"
          description: "{{ $value | humanizePercentage }} of max connections in use"

      # Fires when the mean statement execution time, averaged over 5 minutes,
      # stays above 1 for 10 minutes.
      # avg_over_time() is used because a mean execution time is a gauge;
      # rate() is only meaningful for monotonically increasing counters.
      # NOTE(review): threshold and description assume the metric is exported
      # in seconds — confirm the unit.
      - alert: DatabaseSlowQueries
        expr: avg_over_time(pg_stat_statements_mean_exec_time[5m]) > 1
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Slow database queries detected"
          description: "Average query time is {{ $value }} seconds"

  # Entra VerifiedID API error and rate-limit alerts.
  - name: entra_verifiedid
    interval: 30s
    rules:
      # Fires when the Entra API error counter grows faster than 0.1 errors
      # per second (5-minute rate) for 5 minutes.
      - alert: EntraAPIFailure
        expr: rate(entra_api_errors_total[5m]) > 0.1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High Entra VerifiedID API error rate"
          description: "Error rate is {{ $value }} errors per second"

      # Fires when less than 10% of the rate-limit budget remains for
      # 5 minutes. humanizePercentage renders the 0-1 ratio as a percentage.
      - alert: EntraRateLimitApproaching
        expr: entra_rate_limit_remaining / entra_rate_limit_total < 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Entra VerifiedID rate limit approaching"
          description: "Only {{ $value | humanizePercentage }} of rate limit remaining"