# Prometheus Alerting Rules
#
# PrometheusRule (monitoring.coreos.com/v1) holding the shared alerting rules,
# split into three groups: kubernetes.rules, application.rules and
# infrastructure.rules. Every group is evaluated every 30s.
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: shared-services-alerts
  namespace: monitoring
  labels:
    # NOTE(review): presumably matched by the Prometheus ruleSelector; confirm.
    app: prometheus
spec:
  groups:
    # Pod-level health and resource-pressure alerts.
    - name: kubernetes.rules
      interval: 30s
      rules:
        - alert: PodCrashLooping
          # increase() rather than rate(): the description reports $value as
          # a restart count over the window, not a per-second rate. The alert
          # condition (any restart in the last 15m) is unchanged.
          expr: increase(kube_pod_container_status_restarts_total[15m]) > 0
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping"
            description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} has restarted {{ $value }} times in the last 15 minutes"

        - alert: PodNotReady
          # Fires for any pod whose phase is neither Running nor Succeeded.
          expr: sum by (namespace, pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}) > 0
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is not ready"
            description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for more than 10 minutes"

        - alert: HighMemoryUsage
          # Ratio of summed container memory usage to summed memory limits.
          # The container filters drop the pod-level cgroup (container="") and
          # pause container (container="POD") series, which would otherwise be
          # added on top of the per-container series and inflate the ratio.
          # The generic kube_pod_container_resource_limits metric is used so
          # the rule also works on kube-state-metrics v2 -- TODO confirm the
          # deployed kube-state-metrics version exposes it.
          expr: |-
            (
              sum by (namespace, pod) (
                container_memory_usage_bytes{container!="", container!="POD"}
              )
              /
              sum by (namespace, pod) (
                kube_pod_container_resource_limits{resource="memory"}
              )
            ) > 0.9
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} memory usage is high"
            description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is using {{ $value | humanizePercentage }} of its memory limit"

        - alert: HighCPUUsage
          # Ratio of CPU cores consumed (5m rate) to summed CPU limits, with
          # the same container filters and generic limits metric as
          # HighMemoryUsage.
          expr: |-
            (
              sum by (namespace, pod) (
                rate(container_cpu_usage_seconds_total{container!="", container!="POD"}[5m])
              )
              /
              sum by (namespace, pod) (
                kube_pod_container_resource_limits{resource="cpu"}
              )
            ) > 0.9
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} CPU usage is high"
            description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is using {{ $value | humanizePercentage }} of its CPU limit"

    # HTTP service-level alerts.
    - name: application.rules
      interval: 30s
      rules:
        - alert: HighErrorRate
          # Evaluated per series: more than 0.1 5xx responses per second.
          expr: rate(http_requests_total{status=~"5.."}[5m]) > 0.1
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "High error rate detected"
            description: "Error rate is {{ $value }} errors per second"

        - alert: HighLatency
          # 95th percentile of request duration above 1 second.
          expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "High latency detected"
            description: "95th percentile latency is {{ $value }} seconds"

        - alert: ServiceDown
          # Any scrape target reporting up == 0.
          expr: up{job=~".+"} == 0
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "Service {{ $labels.job }} is down"
            description: "Service {{ $labels.job }} has been down for more than 1 minute"

    # Node and filesystem alerts.
    - name: infrastructure.rules
      interval: 30s
      rules:
        - alert: NodeNotReady
          # The Ready condition's status="true" series is 0 when the node is
          # not Ready.
          expr: kube_node_status_condition{condition="Ready",status="true"} == 0
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Node {{ $labels.node }} is not ready"
            description: "Node {{ $labels.node }} has been in a not-ready state for more than 5 minutes"

        - alert: DiskSpaceLow
          # Less than 10% of the filesystem is available.
          expr: (node_filesystem_avail_bytes / node_filesystem_size_bytes) < 0.1
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Disk space low on {{ $labels.instance }}"
            description: "Disk {{ $labels.device }} on {{ $labels.instance }} has only {{ $value | humanizePercentage }} space available"