feat: expand test coverage and configure comprehensive alerting
- Add unit tests for all core services (identity, intake, finance, dataroom)
- Create integration test framework with shared setup utilities
- Add E2E test suite for complete user workflows
- Add test utilities package (server factory)
- Configure Prometheus alert rules (service health, infrastructure, database, Azure)
- Add alert rules ConfigMap for Kubernetes
- Update Prometheus deployment with alert rules
- Fix tsconfig.json to include test files
- Add tests/tsconfig.json for integration/E2E tests
- Fix server-factory.ts linting issues
This commit is contained in:
86
infra/k8s/base/monitoring/alert-rules-configmap.yaml
Normal file
86
infra/k8s/base/monitoring/alert-rules-configmap.yaml
Normal file
@@ -0,0 +1,86 @@
|
||||
apiVersion: v1
kind: ConfigMap
metadata:
  name: alert-rules
  namespace: the-order
data:
  # Rule file content; Prometheus must list the mounted file under `rule_files`.
  alert-rules.yml: |
    # Prometheus Alert Rules
    # Defines alerting conditions for The Order services

    groups:
      - name: service_health
        interval: 30s
        rules:
          # Fires when a scrape of one of the listed jobs has failed (up == 0)
          # continuously for 5 minutes.
          - alert: ServiceDown
            expr: up{job=~"identity-service|intake-service|finance-service|dataroom-service|legal-documents-service"} == 0
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: "Service {{ $labels.job }} is down"
              description: "Service {{ $labels.job }} has been down for more than 5 minutes"

          # Absolute 5xx rate per series (requests/second), not a ratio of
          # failed to total requests.
          - alert: HighErrorRate
            expr: rate(http_requests_total{status=~"5.."}[5m]) > 0.05
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "High error rate for {{ $labels.job }}"
              description: "Error rate is {{ $value }} errors per second"

          # 95th percentile latency, computed per label set of the histogram.
          - alert: HighResponseTime
            expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2
            for: 10m
            labels:
              severity: warning
            annotations:
              summary: "High response time for {{ $labels.job }}"
              description: "95th percentile response time is {{ $value }} seconds"

      - name: infrastructure
        interval: 30s
        rules:
          # rate() of a CPU-seconds counter is a fraction of one core
          # (0.8 == 80% of a core), so the value is rendered with
          # humanizePercentage instead of appending a literal "%".
          - alert: HighCPUUsage
            expr: rate(process_cpu_user_seconds_total[5m]) > 0.8
            for: 10m
            labels:
              severity: warning
            annotations:
              summary: "High CPU usage for {{ $labels.job }}"
              description: "CPU usage is {{ $value | humanizePercentage }} of one core"

          # NOTE(review): this divides resident memory by the process's virtual
          # address-space size, which is not a memory limit; the ratio may never
          # approach 0.9 in practice. Confirm the intended denominator (e.g. the
          # container memory limit) before relying on this alert.
          - alert: HighMemoryUsage
            expr: (process_resident_memory_bytes / process_virtual_memory_bytes) > 0.9
            for: 10m
            labels:
              severity: warning
            annotations:
              summary: "High memory usage for {{ $labels.job }}"
              description: "Memory usage is {{ $value | humanizePercentage }}"

      - name: database
        interval: 30s
        rules:
          # pg_stat_database_numbackends is reported per database while
          # pg_settings_max_connections is reported per server, so their label
          # sets differ and a plain division matches nothing. Sum the backends
          # per instance and match on `instance` only.
          # Assumes both metrics come from the same exporter target - TODO confirm.
          - alert: DatabaseConnectionPoolExhausted
            expr: sum by (instance) (pg_stat_database_numbackends) / on (instance) pg_settings_max_connections > 0.8
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Database connection pool nearly exhausted"
              description: "{{ $value | humanizePercentage }} of connections in use"

      - name: azure
        interval: 30s
        rules:
          # Any sustained rate of HTTP 429 responses from the Entra API.
          - alert: EntraAPIRateLimit
            expr: rate(entra_api_requests_total{status="429"}[5m]) > 0
            for: 1m
            labels:
              severity: warning
            annotations:
              summary: "Entra API rate limit hit"
              description: "Rate limit errors detected for Entra VerifiedID API"
|
||||
|
||||
97
infra/k8s/base/monitoring/prometheus-configmap.yaml
Normal file
97
infra/k8s/base/monitoring/prometheus-configmap.yaml
Normal file
@@ -0,0 +1,97 @@
|
||||
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: the-order
data:
  # Main Prometheus server configuration.
  prometheus.yml: |
    global:
      scrape_interval: 15s
      evaluation_interval: 15s
      # Attached to every series and alert leaving this server.
      external_labels:
        cluster: 'the-order'
        environment: 'production'

    # Alerting rules are loaded from this single file.
    rule_files:
      - '/etc/prometheus/alert-rules.yml'

    scrape_configs:
      # Prometheus scraping itself.
      - job_name: 'prometheus'
        static_configs:
          - targets:
              - 'localhost:9090'

      # Each service job below discovers pods in the `the-order` namespace,
      # keeps only pods whose `app` label equals the service name, and
      # rewrites the scrape address to <pod IP>:<service port>.

      - job_name: 'identity-service'
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names:
                - the-order
        relabel_configs:
          - action: keep
            source_labels: [__meta_kubernetes_pod_label_app]
            regex: identity-service
          - action: replace
            source_labels: [__meta_kubernetes_pod_ip]
            target_label: __address__
            replacement: $1:4002

      - job_name: 'intake-service'
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names:
                - the-order
        relabel_configs:
          - action: keep
            source_labels: [__meta_kubernetes_pod_label_app]
            regex: intake-service
          - action: replace
            source_labels: [__meta_kubernetes_pod_ip]
            target_label: __address__
            replacement: $1:4001

      - job_name: 'finance-service'
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names:
                - the-order
        relabel_configs:
          - action: keep
            source_labels: [__meta_kubernetes_pod_label_app]
            regex: finance-service
          - action: replace
            source_labels: [__meta_kubernetes_pod_ip]
            target_label: __address__
            replacement: $1:4003

      - job_name: 'dataroom-service'
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names:
                - the-order
        relabel_configs:
          - action: keep
            source_labels: [__meta_kubernetes_pod_label_app]
            regex: dataroom-service
          - action: replace
            source_labels: [__meta_kubernetes_pod_ip]
            target_label: __address__
            replacement: $1:4004

      - job_name: 'legal-documents-service'
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names:
                - the-order
        relabel_configs:
          - action: keep
            source_labels: [__meta_kubernetes_pod_label_app]
            regex: legal-documents-service
          - action: replace
            source_labels: [__meta_kubernetes_pod_ip]
            target_label: __address__
            replacement: $1:4005
|
||||
|
||||
@@ -27,6 +27,9 @@ spec:
|
||||
volumeMounts:
|
||||
- name: prometheus-config
|
||||
mountPath: /etc/prometheus
|
||||
- name: alert-rules
|
||||
mountPath: /etc/prometheus/alert-rules.yml
|
||||
subPath: alert-rules.yml
|
||||
- name: prometheus-storage
|
||||
mountPath: /prometheus
|
||||
resources:
|
||||
|
||||
@@ -1,9 +1,12 @@
|
||||
# Prometheus Alert Rules
|
||||
# Defines alerting conditions for The Order services
|
||||
|
||||
groups:
|
||||
- name: service_health
|
||||
interval: 30s
|
||||
rules:
|
||||
- alert: ServiceDown
|
||||
expr: up{job=~".*-service"} == 0
|
||||
expr: up{job=~"identity-service|intake-service|finance-service|dataroom-service|legal-documents-service"} == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
@@ -17,7 +20,7 @@ groups:
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High error rate in {{ $labels.job }}"
|
||||
summary: "High error rate for {{ $labels.job }}"
|
||||
description: "Error rate is {{ $value }} errors per second"
|
||||
|
||||
- alert: HighResponseTime
|
||||
@@ -26,52 +29,52 @@ groups:
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High response time in {{ $labels.job }}"
|
||||
summary: "High response time for {{ $labels.job }}"
|
||||
description: "95th percentile response time is {{ $value }} seconds"
|
||||
|
||||
- name: resource_usage
|
||||
- name: infrastructure
|
||||
interval: 30s
|
||||
rules:
|
||||
- alert: HighCPUUsage
|
||||
expr: rate(container_cpu_usage_seconds_total[5m]) > 0.8
|
||||
expr: rate(process_cpu_user_seconds_total[5m]) > 0.8
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High CPU usage in {{ $labels.pod }}"
|
||||
description: "CPU usage is {{ $value }}"
|
||||
summary: "High CPU usage for {{ $labels.job }}"
|
||||
description: "CPU usage is {{ $value }}%"
|
||||
|
||||
- alert: HighMemoryUsage
|
||||
expr: container_memory_usage_bytes / container_spec_memory_limit_bytes > 0.9
|
||||
expr: (process_resident_memory_bytes / process_virtual_memory_bytes) > 0.9
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High memory usage in {{ $labels.pod }}"
|
||||
summary: "High memory usage for {{ $labels.job }}"
|
||||
description: "Memory usage is {{ $value }}%"
|
||||
|
||||
- alert: PodCrashLooping
|
||||
expr: rate(kube_pod_container_status_restarts_total[15m]) > 0
|
||||
- alert: DiskSpaceLow
|
||||
expr: (node_filesystem_avail_bytes / node_filesystem_size_bytes) < 0.1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Pod {{ $labels.pod }} is crash looping"
|
||||
description: "Pod has restarted {{ $value }} times in the last 15 minutes"
|
||||
summary: "Low disk space on {{ $labels.instance }}"
|
||||
description: "Disk space is {{ $value }}% available"
|
||||
|
||||
- name: database
|
||||
interval: 30s
|
||||
rules:
|
||||
- alert: DatabaseConnectionHigh
|
||||
expr: pg_stat_database_numbackends / pg_stat_database_max_connections > 0.8
|
||||
- alert: DatabaseConnectionPoolExhausted
|
||||
expr: pg_stat_database_numbackends / pg_settings_max_connections > 0.8
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High database connection usage"
|
||||
description: "{{ $value }}% of max connections in use"
|
||||
summary: "Database connection pool nearly exhausted"
|
||||
description: "{{ $value }}% of connections in use"
|
||||
|
||||
- alert: DatabaseSlowQueries
|
||||
- alert: SlowQueries
|
||||
expr: rate(pg_stat_statements_mean_exec_time[5m]) > 1
|
||||
for: 10m
|
||||
labels:
|
||||
@@ -80,24 +83,23 @@ groups:
|
||||
summary: "Slow database queries detected"
|
||||
description: "Average query time is {{ $value }} seconds"
|
||||
|
||||
- name: entra_verifiedid
|
||||
- name: azure
|
||||
interval: 30s
|
||||
rules:
|
||||
- alert: EntraAPIFailure
|
||||
expr: rate(entra_api_errors_total[5m]) > 0.1
|
||||
for: 5m
|
||||
- alert: EntraAPIRateLimit
|
||||
expr: rate(entra_api_requests_total{status="429"}[5m]) > 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High Entra VerifiedID API error rate"
|
||||
description: "Error rate is {{ $value }} errors per second"
|
||||
summary: "Entra API rate limit hit"
|
||||
description: "Rate limit errors detected for Entra VerifiedID API"
|
||||
|
||||
- alert: EntraRateLimitApproaching
|
||||
expr: entra_rate_limit_remaining / entra_rate_limit_total < 0.1
|
||||
- alert: AzureStorageErrors
|
||||
expr: rate(azure_storage_errors_total[5m]) > 0.01
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Entra VerifiedID rate limit approaching"
|
||||
description: "Only {{ $value }}% of rate limit remaining"
|
||||
|
||||
summary: "Azure Storage errors detected"
|
||||
description: "Storage error rate is {{ $value }} errors per second"
|
||||
|
||||
@@ -138,5 +138,5 @@ alerting:
|
||||
- alertmanager:9093
|
||||
|
||||
rule_files:
|
||||
- '/etc/prometheus/alerts/*.yml'
|
||||
- '/etc/prometheus/alert-rules.yml'
|
||||
|
||||
|
||||
Reference in New Issue
Block a user