- Add Cloud for Sovereignty landing zone architecture and deployment - Implement complete legal document management system - Reorganize documentation with improved navigation - Add infrastructure improvements (Dockerfiles, K8s, monitoring) - Add operational improvements (graceful shutdown, rate limiting, caching) - Create comprehensive project structure documentation - Add Azure deployment automation scripts - Improve repository navigation and organization
104 lines
3.3 KiB
YAML
104 lines
3.3 KiB
YAML
groups:
# Availability, error-rate and latency alerts for the HTTP services.
- name: service_health
  interval: 30s
  rules:
  # Targets whose job label ends in "-service" and whose `up` series has
  # been 0 for five minutes.
  - alert: ServiceDown
    expr: 'up{job=~".*-service"} == 0'
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: 'Service {{ $labels.job }} is down'
      description: 'Service {{ $labels.job }} has been down for more than 5 minutes'

  # Per-series rate of 5xx responses above 0.05 requests per second,
  # sustained for five minutes.
  - alert: HighErrorRate
    expr: 'rate(http_requests_total{status=~"5.."}[5m]) > 0.05'
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'High error rate in {{ $labels.job }}'
      description: 'Error rate is {{ $value }} errors per second'

  # 95th-percentile request duration, estimated from the histogram
  # buckets over a 5m window, above 2 for ten minutes.
  - alert: HighResponseTime
    expr: 'histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2'
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: 'High response time in {{ $labels.job }}'
      description: '95th percentile response time is {{ $value }} seconds'

# Container CPU, memory and restart alerts.
- name: resource_usage
  interval: 30s
  rules:
  # Per-series CPU time consumed per second (5m rate) above 0.8,
  # sustained for ten minutes.
  - alert: HighCPUUsage
    expr: rate(container_cpu_usage_seconds_total[5m]) > 0.8
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "High CPU usage in {{ $labels.pod }}"
      description: "CPU usage is {{ $value }}"

  # Memory usage above 90% of the container's configured limit.
  - alert: HighMemoryUsage
    # The `> 0` filter drops series whose limit is 0 (cAdvisor's value for
    # containers without a memory limit); dividing by 0 yields +Inf, which
    # would otherwise keep this alert firing for every unlimited container.
    expr: container_memory_usage_bytes / (container_spec_memory_limit_bytes > 0) > 0.9
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "High memory usage in {{ $labels.pod }}"
      # $value is a 0-1 ratio; humanizePercentage renders it as e.g. "93.4%".
      description: "Memory usage is {{ $value | humanizePercentage }}"

  # Any container restart inside the 15m window, sustained for five minutes.
  - alert: PodCrashLooping
    # increase() fires under the same condition as rate() > 0 but yields
    # the restart count over the window, which is what the description
    # reports (rate() would report restarts per second).
    expr: increase(kube_pod_container_status_restarts_total[15m]) > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Pod {{ $labels.pod }} is crash looping"
      description: 'Pod has restarted {{ printf "%.0f" $value }} times in the last 15 minutes'

# PostgreSQL connection-saturation and query-latency alerts.
- name: database
  interval: 30s
  rules:
  # Backends in use above 80% of the configured maximum for five minutes.
  - alert: DatabaseConnectionHigh
    # NOTE(review): both operands must carry identical label sets for this
    # division to return any series - confirm against the exporter's
    # metrics (numbackends is presumably per database; verify).
    expr: pg_stat_database_numbackends / pg_stat_database_max_connections > 0.8
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High database connection usage"
      # $value is a 0-1 ratio; humanizePercentage renders it as e.g. "85%".
      description: "{{ $value | humanizePercentage }} of max connections in use"

  - alert: DatabaseSlowQueries
    # NOTE(review): rate() is meant for counters, but the metric name
    # suggests a gauge holding a mean - confirm the metric type and its
    # unit before relying on the threshold and the "seconds" wording below.
    expr: rate(pg_stat_statements_mean_exec_time[5m]) > 1
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "Slow database queries detected"
      description: "Average query time is {{ $value }} seconds"

# Alerts on the Entra VerifiedID integration: API errors and rate-limit
# headroom.
- name: entra_verifiedid
  interval: 30s
  rules:
  # API error rate above 0.1 errors per second for five minutes.
  - alert: EntraAPIFailure
    expr: rate(entra_api_errors_total[5m]) > 0.1
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "High Entra VerifiedID API error rate"
      description: "Error rate is {{ $value }} errors per second"

  # Less than 10% of the rate-limit budget left for five minutes.
  - alert: EntraRateLimitApproaching
    expr: entra_rate_limit_remaining / entra_rate_limit_total < 0.1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Entra VerifiedID rate limit approaching"
      # $value is a 0-1 ratio; humanizePercentage renders it as e.g. "7.5%".
      description: "Only {{ $value | humanizePercentage }} of rate limit remaining"