chore: sync submodule state (parent ref update)
Made-with: Cursor
This commit is contained in:
73
monitoring/alerts.yml
Normal file
73
monitoring/alerts.yml
Normal file
@@ -0,0 +1,73 @@
|
||||
# Prometheus alerting rules for SolaceNet
|
||||
|
||||
groups:
|
||||
# Rule group: capability state, kill-switch activations and policy-decision
# latency for SolaceNet.
- name: solacenet_capabilities
  interval: 30s  # this group's rules are evaluated every 30 seconds
  rules:
  # Fires when a capability has reported state="disabled" for 5 minutes.
  - alert: CapabilityDisabled
    expr: solacenet_capability_state{state="disabled"} > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Capability {{ $labels.capability_id }} is disabled"
      description: "Capability {{ $labels.capability_id }} has been disabled for {{ $labels.tenant_id }}"

  # No `for:` clause, so this fires on the first evaluation in which the
  # activation counter has increased within the trailing 5-minute window.
  - alert: KillSwitchActivated
    expr: increase(solacenet_kill_switch_activations_total[5m]) > 0
    labels:
      severity: critical
    annotations:
      summary: "Kill switch activated for {{ $labels.capability_id }}"
      description: "Emergency kill switch was activated for capability {{ $labels.capability_id }}"

  # p95 policy-decision latency above 1 second for 5 minutes.
  # The bucket counters are cumulative, so they are wrapped in rate() to get
  # the quantile over the last 5 minutes; feeding raw buckets to
  # histogram_quantile() would yield the quantile since process start.
  - alert: HighPolicyDecisionLatency
    expr: histogram_quantile(0.95, rate(solacenet_policy_decision_duration_seconds_bucket[5m])) > 1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High policy decision latency"
      description: "95th percentile policy decision latency is {{ $value }}s"
|
||||
|
||||
# Rule group: risk-score thresholds and risk-engine reachability.
- name: solacenet_risk
  interval: 30s
  rules:
  # Warn once a risk score has stayed above 80 for two minutes.
  - alert: HighRiskScore
    expr: 'solacenet_risk_score > 80'
    for: 2m
    annotations:
      summary: 'High risk score detected'
      description: 'Risk score of {{ $value }} detected for transaction {{ $labels.transaction_id }}'
    labels:
      severity: warning

  # Critical when the risk-engine scrape target has reported up == 0 for a minute.
  # NOTE(review): no scrape job named "risk-engine" appears in the
  # prometheus.yml shown with this change; confirm it is defined elsewhere,
  # otherwise this selector matches no series and the alert cannot fire.
  - alert: RiskEngineDown
    expr: 'up{job="risk-engine"} == 0'
    for: 1m
    annotations:
      summary: 'Risk engine is down'
      description: 'Risk rules engine is not responding'
    labels:
      severity: critical
|
||||
|
||||
# Rule group: availability of the infrastructure scrape targets
# (Redis exporter and the SolaceNet gateway).
- name: solacenet_infrastructure
  interval: 30s
  rules:
  # The "redis" scrape job has reported up == 0 for one minute.
  - alert: RedisDown
    expr: 'up{job="redis"} == 0'
    for: 1m
    annotations:
      summary: 'Redis is down'
      description: 'Redis cache is not available, policy decisions will not be cached'
    labels:
      severity: critical

  # The "solacenet-gateway" scrape job has reported up == 0 for one minute.
  - alert: GatewayDown
    expr: 'up{job="solacenet-gateway"} == 0'
    for: 1m
    annotations:
      summary: 'SolaceNet Gateway is down'
      description: 'The SolaceNet API Gateway is not responding'
    labels:
      severity: critical
|
||||
85
monitoring/as4-alerts.yml
Normal file
85
monitoring/as4-alerts.yml
Normal file
@@ -0,0 +1,85 @@
|
||||
# Prometheus Alerting Rules for AS4 Settlement
|
||||
|
||||
groups:
|
||||
# Rule group: health, latency, failure and capacity alerts for the AS4
# settlement service.
- name: as4_settlement
  interval: 30s  # this group's rules are evaluated every 30 seconds
  rules:
  # High Latency Alert: p99 latency gauge above 5 for 5 minutes.
  - alert: AS4HighLatency
    expr: as4_message_latency_p99 > 5
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "AS4 message processing latency is high"
      description: "P99 latency is {{ $value }}s (threshold: 5s)"

  # High Failure Rate Alert.
  # rate() yields failures per second, not a percentage, so the annotation
  # reports the value and threshold in failures/sec.
  # NOTE(review): if a true 1% failure ratio is intended, divide by the rate
  # of the total-instructions counter (not visible here).
  - alert: AS4HighFailureRate
    expr: rate(as4_instructions_failed[5m]) > 0.01
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "AS4 instruction failure rate is high"
      description: "Failure rate is {{ $value }} failures/sec (threshold: 0.01 failures/sec)"

  # Certificate Expiring Alert: fewer than 30 days remaining, sustained for 1 hour.
  - alert: AS4CertificateExpiring
    expr: as4_certificate_days_until_expiry < 30
    for: 1h
    labels:
      severity: warning
    annotations:
      summary: "AS4 certificate expiring soon"
      description: "Certificate expires in {{ $value }} days"

  # System Unavailable Alert: the "as4-settlement" scrape job reports up == 0.
  - alert: AS4SystemUnavailable
    expr: up{job="as4-settlement"} == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "AS4 Settlement system is down"
      description: "AS4 service is not responding"

  # Database Connection Alert: connection-status gauge equals 0 for one minute.
  - alert: AS4DatabaseConnectionFailed
    expr: as4_database_connection_status == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "AS4 database connection failed"
      description: "Cannot connect to database"

  # Redis Connection Alert: connection-status gauge equals 0 for one minute.
  - alert: AS4RedisConnectionFailed
    expr: as4_redis_connection_status == 0
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "AS4 Redis connection failed"
      description: "Cannot connect to Redis (nonce tracking may be affected)"

  # High Memory Usage Alert: memory usage above 80% for 5 minutes.
  - alert: AS4HighMemoryUsage
    expr: as4_memory_usage_percent > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "AS4 system memory usage is high"
      description: "Memory usage is {{ $value }}%"

  # Queue Backlog Alert: more than 1000 queued instructions for 5 minutes.
  - alert: AS4QueueBacklog
    expr: as4_instruction_queue_length > 1000
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "AS4 instruction queue backlog"
      description: "Queue length is {{ $value }} instructions"
|
||||
234
monitoring/grafana/dashboards/README.md
Normal file
234
monitoring/grafana/dashboards/README.md
Normal file
@@ -0,0 +1,234 @@
|
||||
# Grafana Dashboards
|
||||
|
||||
This directory contains Grafana dashboard JSON files for monitoring the DBIS Core Banking System.
|
||||
|
||||
## Dashboard List
|
||||
|
||||
### 1. System Health Dashboard (`system-health.json`)
|
||||
|
||||
**Purpose**: Overall system health and status monitoring
|
||||
|
||||
**Key Metrics**:
|
||||
- Service health status
|
||||
- Overall system availability
|
||||
- Error rates (5xx, 4xx)
|
||||
- CPU and memory usage by service
|
||||
- Database connection pool status
|
||||
- Active sessions
|
||||
- Queue lengths
|
||||
|
||||
**Refresh Interval**: 30s
|
||||
|
||||
**Tags**: `system`, `health`, `overview`
|
||||
|
||||
---
|
||||
|
||||
### 2. API Performance Dashboard (`api-performance.json`)
|
||||
|
||||
**Purpose**: API endpoint performance and latency monitoring
|
||||
|
||||
**Key Metrics**:
|
||||
- Request rate by endpoint
|
||||
- Response time percentiles (P50, P95, P99)
|
||||
- Error rate by endpoint
|
||||
- Top endpoints by request volume
|
||||
- Request distribution by method and status code
|
||||
- SLO compliance (availability, latency)
|
||||
- Request duration distribution
|
||||
|
||||
**Refresh Interval**: 30s
|
||||
|
||||
**Tags**: `api`, `performance`, `latency`
|
||||
|
||||
---
|
||||
|
||||
### 3. Ledger Operations Dashboard (`ledger-operations.json`)
|
||||
|
||||
**Purpose**: Ledger entry and settlement operations monitoring
|
||||
|
||||
**Key Metrics**:
|
||||
- Ledger entry rate by ledger ID
|
||||
- Ledger entry amount by ledger and currency
|
||||
- Settlement rate by status
|
||||
- Settlement duration percentiles
|
||||
- Outbox queue status and processing rate
|
||||
- Balance updates by currency
|
||||
- Failed posting operations
|
||||
- Total ledger entries, active accounts, pending settlements
|
||||
|
||||
**Refresh Interval**: 30s
|
||||
|
||||
**Tags**: `ledger`, `transactions`, `settlement`
|
||||
|
||||
---
|
||||
|
||||
### 4. Security & Compliance Dashboard (`security-compliance.json`)
|
||||
|
||||
**Purpose**: Security events and compliance monitoring
|
||||
|
||||
**Key Metrics**:
|
||||
- Authentication failures by reason
|
||||
- Authorization failures by resource and action
|
||||
- Sanctions screening results
|
||||
- AML risk score distribution
|
||||
- Audit log events by type
|
||||
- Policy violations by type
|
||||
- Failed transactions by reason
|
||||
- Encryption key rotation status
|
||||
- Data access events (PII, Financial)
|
||||
- Security incidents and compliance violations (24h)
|
||||
|
||||
**Refresh Interval**: 30s
|
||||
|
||||
**Tags**: `security`, `compliance`, `audit`
|
||||
|
||||
---
|
||||
|
||||
## Installation
|
||||
|
||||
### Import Dashboards to Grafana
|
||||
|
||||
1. **Via Grafana UI**:
|
||||
- Navigate to Grafana → Dashboards → Import
|
||||
- Upload the JSON file or paste JSON content
|
||||
- Configure data source and settings
|
||||
- Save dashboard
|
||||
|
||||
2. **Via Grafana Provisioning**:
|
||||
|
||||
Create a provisioning configuration file:
|
||||
|
||||
```yaml
|
||||
# grafana/provisioning/dashboards/dashboards.yml
|
||||
apiVersion: 1
|
||||
|
||||
providers:
|
||||
- name: 'DBIS Core Dashboards'
|
||||
orgId: 1
|
||||
folder: 'DBIS Core'
|
||||
type: file
|
||||
disableDeletion: false
|
||||
updateIntervalSeconds: 10
|
||||
allowUiUpdates: true
|
||||
options:
|
||||
path: /etc/grafana/dashboards
|
||||
```
|
||||
|
||||
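Copy dashboard files to the provisioned path. Note that file provisioning expects the bare dashboard model, so a file that wraps it in a top-level `dashboard` key (the API import format) must be unwrapped first, for example with `jq .dashboard`: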
|
||||
|
||||
```bash
|
||||
cp dbis_core/monitoring/grafana/dashboards/*.json /etc/grafana/dashboards/
|
||||
```
|
||||
|
||||
3. **Via Grafana API**:
|
||||
|
||||
```bash
|
||||
# Import dashboard via API
|
||||
curl -X POST \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer <grafana-api-key>" \
|
||||
-d @system-health.json \
|
||||
http://grafana:3000/api/dashboards/db
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Configuration
|
||||
|
||||
### Data Source Configuration
|
||||
|
||||
Ensure Prometheus data source is configured in Grafana:
|
||||
|
||||
1. Navigate to Configuration → Data Sources
|
||||
2. Add Prometheus data source
|
||||
3. Set URL: `http://prometheus:9090`
|
||||
4. Configure scrape interval and timeouts
|
||||
|
||||
### Variable Configuration
|
||||
|
||||
Some dashboards may use variables for filtering:
|
||||
|
||||
- `$datasource`: Prometheus data source
|
||||
- `$service`: Service name filter (optional)
|
||||
- `$environment`: Environment filter (optional)
|
||||
|
||||
---
|
||||
|
||||
## Metrics Requirements
|
||||
|
||||
### Prometheus Metrics
|
||||
|
||||
These dashboards expect the following Prometheus metrics to be exported:
|
||||
|
||||
#### System Metrics
|
||||
- `up{job="dbis-core"}`
|
||||
- `process_cpu_seconds_total{job="dbis-core"}`
|
||||
- `process_resident_memory_bytes{job="dbis-core"}`
|
||||
- `db_pool_size{job="dbis-core"}`
|
||||
- `db_pool_active{job="dbis-core"}`
|
||||
- `db_pool_idle{job="dbis-core"}`
|
||||
|
||||
#### API Metrics
|
||||
- `http_requests_total{job="dbis-core",endpoint,method,status}`
|
||||
- `http_request_duration_seconds_bucket{job="dbis-core",endpoint,le}`
|
||||
|
||||
#### Ledger Metrics
|
||||
- `ledger_entries_total{ledger_id}`
|
||||
- `ledger_entry_amount_total{ledger_id,currency_code}`
|
||||
- `settlement_total{status}`
|
||||
- `settlement_duration_seconds_bucket{le}`
|
||||
- `dbis_outbox_queue_length`
|
||||
- `outbox_processed_total{status}`
|
||||
- `balance_updates_total{currency_code}`
|
||||
- `ledger_posting_errors_total{error_type}`
|
||||
|
||||
#### Security Metrics
|
||||
- `authentication_failures_total{reason}`
|
||||
- `authorization_failures_total{resource,action}`
|
||||
- `sanctions_screening_total{result}`
|
||||
- `aml_risk_score_bucket{le}`
|
||||
- `audit_log_events_total{event_type}`
|
||||
- `policy_violations_total{policy_type,violation_type}`
|
||||
- `transaction_failures_total{reason}`
|
||||
- `data_access_events_total{data_type,operation}`
|
||||
- `security_incidents_total`
|
||||
- `compliance_violations_total`
|
||||
|
||||
---
|
||||
|
||||
## Alerting
|
||||
|
||||
### Recommended Alerts
|
||||
|
||||
Based on these dashboards, configure alerts for:
|
||||
|
||||
1. **System Health**:
|
||||
- Service down (`up{job="dbis-core"} == 0`)
|
||||
- High error rate (`sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) > 0.05`)
|
||||
- High memory usage (`process_resident_memory_bytes > 8e9`, i.e. 8 GB)
|
||||
- Database connection pool exhausted (`db_pool_active >= db_pool_size * 0.9`)
|
||||
|
||||
2. **API Performance**:
|
||||
- P95 latency > 500ms
|
||||
- Availability < 99.9%
|
||||
- Error rate > 0.1%
|
||||
|
||||
3. **Ledger Operations**:
|
||||
- Outbox queue length > 1000
|
||||
- Settlement failure rate > 1%
|
||||
- Failed posting operations > 10/min
|
||||
|
||||
4. **Security & Compliance**:
|
||||
- Authentication failure rate > 5%
|
||||
- Sanctions match detected
|
||||
- AML risk score > 80
|
||||
- Security incident detected
|
||||
- Compliance violation detected
|
||||
|
||||
---
|
||||
|
||||
## References
|
||||
|
||||
- Metrics Specification: `explorer-monorepo/docs/specs/observability/metrics-monitoring.md`
|
||||
- Tracing Dashboard: `smom-dbis-138/monitoring/grafana/dashboards/tracing.json`
|
||||
- OpenTelemetry Configuration: `smom-dbis-138/monitoring/opentelemetry/otel-collector.yaml`
|
||||
158
monitoring/grafana/dashboards/api-performance.json
Normal file
158
monitoring/grafana/dashboards/api-performance.json
Normal file
@@ -0,0 +1,158 @@
|
||||
{
|
||||
"dashboard": {
|
||||
"title": "DBIS Core - API Performance",
|
||||
"tags": ["api", "performance", "latency"],
|
||||
"timezone": "browser",
|
||||
"schemaVersion": 27,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"title": "Request Rate",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(http_requests_total{job=\"dbis-core\"}[5m])) by (endpoint)",
|
||||
"legendFormat": "{{endpoint}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 0}
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"title": "Response Time Percentiles",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket{job=\"dbis-core\"}[5m])) by (le, endpoint))",
|
||||
"legendFormat": "{{endpoint}} - P50"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{job=\"dbis-core\"}[5m])) by (le, endpoint))",
|
||||
"legendFormat": "{{endpoint}} - P95"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{job=\"dbis-core\"}[5m])) by (le, endpoint))",
|
||||
"legendFormat": "{{endpoint}} - P99"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 0}
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"title": "Error Rate by Endpoint",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(http_requests_total{job=\"dbis-core\",status=~\"5..\"}[5m])) by (endpoint)",
|
||||
"legendFormat": "{{endpoint}} - 5xx"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(http_requests_total{job=\"dbis-core\",status=~\"4..\"}[5m])) by (endpoint)",
|
||||
"legendFormat": "{{endpoint}} - 4xx"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8}
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"title": "Top Endpoints by Request Volume",
|
||||
"type": "bargauge",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "topk(10, sum(rate(http_requests_total{job=\"dbis-core\"}[5m])) by (endpoint))",
|
||||
"legendFormat": "{{endpoint}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 8}
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"title": "Request Rate by Method",
|
||||
"type": "piechart",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(http_requests_total{job=\"dbis-core\"}[5m])) by (method)",
|
||||
"legendFormat": "{{method}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 8, "x": 0, "y": 16}
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"title": "Request Rate by Status Code",
|
||||
"type": "piechart",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(http_requests_total{job=\"dbis-core\"}[5m])) by (status)",
|
||||
"legendFormat": "{{status}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 8, "x": 8, "y": 16}
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"title": "SLO Compliance - Availability",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(1 - (sum(rate(http_requests_total{job=\"dbis-core\",status=~\"5..\"}[5m])) / sum(rate(http_requests_total{job=\"dbis-core\"}[5m])))) * 100",
|
||||
"legendFormat": "Availability %"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"value": 0, "color": "red"},
|
||||
{"value": 99.9, "color": "yellow"},
|
||||
{"value": 99.99, "color": "green"}
|
||||
]
|
||||
},
|
||||
"unit": "percent"
|
||||
}
|
||||
},
|
||||
"gridPos": {"h": 4, "w": 4, "x": 16, "y": 16}
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"title": "SLO Compliance - P95 Latency",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{job=\"dbis-core\"}[5m])) by (le))",
|
||||
"legendFormat": "P95 Latency"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"value": 0, "color": "green"},
|
||||
{"value": 0.5, "color": "yellow"},
|
||||
{"value": 1.0, "color": "red"}
|
||||
]
|
||||
},
|
||||
"unit": "s"
|
||||
}
|
||||
},
|
||||
"gridPos": {"h": 4, "w": 4, "x": 20, "y": 16}
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"title": "Request Duration Distribution",
|
||||
"type": "heatmap",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(http_request_duration_seconds_bucket{job=\"dbis-core\"}[5m])) by (le)",
|
||||
"legendFormat": "{{le}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 24}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
164
monitoring/grafana/dashboards/ledger-operations.json
Normal file
164
monitoring/grafana/dashboards/ledger-operations.json
Normal file
@@ -0,0 +1,164 @@
|
||||
{
|
||||
"dashboard": {
|
||||
"title": "DBIS Core - Ledger Operations",
|
||||
"tags": ["ledger", "transactions", "settlement"],
|
||||
"timezone": "browser",
|
||||
"schemaVersion": 27,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"title": "Ledger Entry Rate",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(ledger_entries_total[5m])) by (ledger_id)",
|
||||
"legendFormat": "{{ledger_id}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 0}
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"title": "Ledger Entry Amount by Ledger",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(ledger_entry_amount_total[5m])) by (ledger_id, currency_code)",
|
||||
"legendFormat": "{{ledger_id}} - {{currency_code}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 0}
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"title": "Settlement Rate",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(settlement_total[5m])) by (status)",
|
||||
"legendFormat": "{{status}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8}
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"title": "Settlement Duration",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, sum(rate(settlement_duration_seconds_bucket[5m])) by (le))",
|
||||
"legendFormat": "P50"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum(rate(settlement_duration_seconds_bucket[5m])) by (le))",
|
||||
"legendFormat": "P95"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(settlement_duration_seconds_bucket[5m])) by (le))",
|
||||
"legendFormat": "P99"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 8}
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"title": "Outbox Queue Status",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "dbis_outbox_queue_length",
|
||||
"legendFormat": "Queue Length"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 16}
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"title": "Outbox Processing Rate",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(outbox_processed_total[5m])) by (status)",
|
||||
"legendFormat": "{{status}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 16}
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"title": "Balance Updates",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(balance_updates_total[5m])) by (currency_code)",
|
||||
"legendFormat": "{{currency_code}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 24}
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"title": "Failed Posting Operations",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(ledger_posting_errors_total[5m])) by (error_type)",
|
||||
"legendFormat": "{{error_type}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 24}
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"title": "Total Ledger Entries",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "ledger_entries_count",
|
||||
"legendFormat": "Total Entries"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 4, "w": 6, "x": 0, "y": 32}
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"title": "Active Accounts",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "bank_accounts_count",
|
||||
"legendFormat": "Active Accounts"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 4, "w": 6, "x": 6, "y": 32}
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"title": "Pending Settlements",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "settlements_pending_count",
|
||||
"legendFormat": "Pending"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 4, "w": 6, "x": 12, "y": 32}
|
||||
},
|
||||
{
|
||||
"id": 12,
|
||||
"title": "Successful Settlements (24h)",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "increase(settlement_total{status=\"SETTLED\"}[24h])",
|
||||
"legendFormat": "Successful"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 4, "w": 6, "x": 18, "y": 32}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
167
monitoring/grafana/dashboards/security-compliance.json
Normal file
167
monitoring/grafana/dashboards/security-compliance.json
Normal file
@@ -0,0 +1,167 @@
|
||||
{
|
||||
"dashboard": {
|
||||
"title": "DBIS Core - Security & Compliance",
|
||||
"tags": ["security", "compliance", "audit"],
|
||||
"timezone": "browser",
|
||||
"schemaVersion": 27,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"title": "Authentication Failures",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(authentication_failures_total[5m])) by (reason)",
|
||||
"legendFormat": "{{reason}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 0}
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"title": "Authorization Failures",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(authorization_failures_total[5m])) by (resource, action)",
|
||||
"legendFormat": "{{resource}} - {{action}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 0}
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"title": "Sanctions Screening Results",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(sanctions_screening_total[5m])) by (result)",
|
||||
"legendFormat": "{{result}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8}
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"title": "AML Risk Score Distribution",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, sum(rate(aml_risk_score_bucket[5m])) by (le))",
|
||||
"legendFormat": "P50"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum(rate(aml_risk_score_bucket[5m])) by (le))",
|
||||
"legendFormat": "P95"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(aml_risk_score_bucket[5m])) by (le))",
|
||||
"legendFormat": "P99"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 8}
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"title": "Audit Log Events",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(audit_log_events_total[5m])) by (event_type)",
|
||||
"legendFormat": "{{event_type}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 16}
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"title": "Policy Violations",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(policy_violations_total[5m])) by (policy_type, violation_type)",
|
||||
"legendFormat": "{{policy_type}} - {{violation_type}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 16}
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"title": "Failed Transactions by Reason",
|
||||
"type": "piechart",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(transaction_failures_total[5m])) by (reason)",
|
||||
"legendFormat": "{{reason}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 24}
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"title": "Encryption Key Rotation Status",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "encryption_key_rotation_status",
|
||||
"legendFormat": "Status"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"value": 0, "color": "red"},
|
||||
{"value": 1, "color": "green"}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"gridPos": {"h": 4, "w": 6, "x": 12, "y": 24}
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"title": "Data Access Events",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(data_access_events_total{data_type=\"PII\"}[5m])) by (operation)",
|
||||
"legendFormat": "PII - {{operation}}"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(data_access_events_total{data_type=\"FINANCIAL\"}[5m])) by (operation)",
|
||||
"legendFormat": "Financial - {{operation}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 6, "x": 18, "y": 24}
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"title": "Security Incidents (24h)",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "increase(security_incidents_total[24h])",
|
||||
"legendFormat": "Incidents"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 4, "w": 6, "x": 0, "y": 32}
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"title": "Compliance Violations (24h)",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "increase(compliance_violations_total[24h])",
|
||||
"legendFormat": "Violations"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 4, "w": 6, "x": 6, "y": 32}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
147
monitoring/grafana/dashboards/system-health.json
Normal file
147
monitoring/grafana/dashboards/system-health.json
Normal file
@@ -0,0 +1,147 @@
|
||||
{
|
||||
"dashboard": {
|
||||
"title": "DBIS Core - System Health",
|
||||
"tags": ["system", "health", "overview"],
|
||||
"timezone": "browser",
|
||||
"schemaVersion": 27,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"title": "Service Health Status",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "up{job=\"dbis-core\"}",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"value": 0, "color": "red"},
|
||||
{"value": 1, "color": "green"}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"gridPos": {"h": 4, "w": 6, "x": 0, "y": 0}
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"title": "Overall System Status",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(up{job=\"dbis-core\"} == 1) / count(up{job=\"dbis-core\"}) * 100",
|
||||
"legendFormat": "Health %"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 4, "w": 6, "x": 6, "y": 0}
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"title": "Total Error Rate",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(http_requests_total{status=~\"5..\"}[5m]))",
|
||||
"legendFormat": "5xx Errors/sec"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(http_requests_total{status=~\"4..\"}[5m]))",
|
||||
"legendFormat": "4xx Errors/sec"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 0}
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"title": "CPU Usage by Service",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(process_cpu_seconds_total{job=\"dbis-core\"}[5m]) * 100",
|
||||
"legendFormat": "{{instance}} - {{service}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8}
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"title": "Memory Usage by Service",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "process_resident_memory_bytes{job=\"dbis-core\"} / 1024 / 1024",
|
||||
"legendFormat": "{{instance}} - {{service}} (MB)"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 8}
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"title": "Database Connection Pool",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "db_pool_size{job=\"dbis-core\"}",
|
||||
"legendFormat": "Pool Size"
|
||||
},
|
||||
{
|
||||
"expr": "db_pool_active{job=\"dbis-core\"}",
|
||||
"legendFormat": "Active Connections"
|
||||
},
|
||||
{
|
||||
"expr": "db_pool_idle{job=\"dbis-core\"}",
|
||||
"legendFormat": "Idle Connections"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 16}
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"title": "Request Rate by Service",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(http_requests_total{job=\"dbis-core\"}[5m])) by (service)",
|
||||
"legendFormat": "{{service}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 16}
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"title": "Active Sessions",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "dbis_sessions_active",
|
||||
"legendFormat": "Active"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 4, "w": 6, "x": 0, "y": 24}
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"title": "Queue Length",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "dbis_queue_length{queue=\"dual_ledger_outbox\"}",
|
||||
"legendFormat": "Outbox Queue"
|
||||
},
|
||||
{
|
||||
"expr": "dbis_queue_length{queue=\"settlement\"}",
|
||||
"legendFormat": "Settlement Queue"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 18, "x": 6, "y": 24}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
10
monitoring/prometheus-as4.yml
Normal file
10
monitoring/prometheus-as4.yml
Normal file
@@ -0,0 +1,10 @@
|
||||
# Prometheus Configuration for AS4 Settlement
|
||||
# Add this to your main prometheus.yml
|
||||
|
||||
# Scrape job for the AS4 settlement service; intended to be merged into the
# main prometheus.yml under its scrape_configs list.
scrape_configs:
  - job_name: "as4-settlement"
    # Per-job overrides: scrape every 15s, give up on a scrape after 10s.
    scrape_interval: 15s
    scrape_timeout: 10s
    metrics_path: "/api/v1/as4/metrics"
    static_configs:
      - targets:
          - "localhost:3000"
|
||||
32
monitoring/prometheus.yml
Normal file
32
monitoring/prometheus.yml
Normal file
@@ -0,0 +1,32 @@
|
||||
# Prometheus configuration for SolaceNet monitoring
|
||||
|
||||
# Defaults applied to every scrape job and rule group unless overridden.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

# Alerting rule files loaded by this server.
# NOTE(review): only alerts.yml is listed; as4-alerts.yml from the same
# directory is not loaded here - confirm whether that is intentional.
rule_files:
  - "alerts.yml"

# Where fired alerts are sent.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - "alertmanager:9093"

scrape_configs:
  # DBIS API metrics
  # NOTE(review): the Grafana dashboards in this change query job="dbis-core",
  # while this job is named "dbis-api" - confirm which name is intended.
  - job_name: "dbis-api"
    metrics_path: "/metrics"
    static_configs:
      - targets:
          - "dbis-api:3000"

  # SolaceNet Gateway metrics
  - job_name: "solacenet-gateway"
    metrics_path: "/metrics"
    static_configs:
      - targets:
          - "solacenet-gateway:8080"

  # Redis metrics (if using redis_exporter)
  - job_name: "redis"
    static_configs:
      - targets:
          - "redis-exporter:9121"
|
||||
Reference in New Issue
Block a user