chore: sync submodule state (parent ref update)

Made-with: Cursor
This commit is contained in:
defiQUG
2026-03-02 12:14:07 -08:00
parent 6c4555cebd
commit 89b82cdadb
883 changed files with 78752 additions and 18180 deletions

73
monitoring/alerts.yml Normal file
View File

@@ -0,0 +1,73 @@
# Prometheus alerting rules for SolaceNet
#
# Three rule groups, each evaluated every 30s:
#   - solacenet_capabilities: capability state, kill switch, policy latency
#   - solacenet_risk:         risk scores and risk-engine availability
#   - solacenet_infrastructure: Redis and API gateway availability
groups:
  - name: solacenet_capabilities
    interval: 30s
    rules:
      # Fires when a capability series has reported state="disabled" for 5m.
      - alert: CapabilityDisabled
        expr: solacenet_capability_state{state="disabled"} > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Capability {{ $labels.capability_id }} is disabled"
          description: "Capability {{ $labels.capability_id }} has been disabled for {{ $labels.tenant_id }}"

      # No `for` clause: fires on the first evaluation that sees the
      # activation counter increase within the trailing 5 minutes.
      - alert: KillSwitchActivated
        expr: increase(solacenet_kill_switch_activations_total[5m]) > 0
        labels:
          severity: critical
        annotations:
          summary: "Kill switch activated for {{ $labels.capability_id }}"
          description: "Emergency kill switch was activated for capability {{ $labels.capability_id }}"

      # histogram_quantile must be fed per-second bucket rates aggregated by
      # `le`; applying it to the raw cumulative bucket counters yields the
      # quantile over the whole process lifetime, per series, instead of the
      # current latency.
      - alert: HighPolicyDecisionLatency
        expr: histogram_quantile(0.95, sum by (le) (rate(solacenet_policy_decision_duration_seconds_bucket[5m]))) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High policy decision latency"
          description: "95th percentile policy decision latency is {{ $value }}s"

  - name: solacenet_risk
    interval: 30s
    rules:
      # Fires when a risk score series stays above 80 for 2m.
      - alert: HighRiskScore
        expr: solacenet_risk_score > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High risk score detected"
          description: "Risk score of {{ $value }} detected for transaction {{ $labels.transaction_id }}"

      # NOTE(review): `up` only exists for jobs Prometheus actually scrapes;
      # confirm a `risk-engine` scrape job is configured, otherwise this rule
      # can never fire.
      - alert: RiskEngineDown
        expr: up{job="risk-engine"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Risk engine is down"
          description: "Risk rules engine is not responding"

  - name: solacenet_infrastructure
    interval: 30s
    rules:
      # Scrape of the `redis` job has been failing for 1m.
      - alert: RedisDown
        expr: up{job="redis"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Redis is down"
          description: "Redis cache is not available, policy decisions will not be cached"

      # Scrape of the `solacenet-gateway` job has been failing for 1m.
      - alert: GatewayDown
        expr: up{job="solacenet-gateway"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "SolaceNet Gateway is down"
          description: "The SolaceNet API Gateway is not responding"

85
monitoring/as4-alerts.yml Normal file
View File

@@ -0,0 +1,85 @@
# Prometheus Alerting Rules for AS4 Settlement
#
# Single rule group `as4_settlement`, evaluated every 30s.
groups:
  - name: as4_settlement
    interval: 30s
    rules:
      # High Latency Alert
      - alert: AS4HighLatency
        expr: as4_message_latency_p99 > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "AS4 message processing latency is high"
          description: "P99 latency is {{ $value }}s (threshold: 5s)"

      # High Failure Rate Alert
      # rate() yields failures per second, not a percentage, so the
      # annotation reports the value in failures/s to match the expression.
      # NOTE(review): if a percentage threshold is intended, divide by the
      # rate of a total-instructions counter — none is visible here.
      - alert: AS4HighFailureRate
        expr: rate(as4_instructions_failed[5m]) > 0.01
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "AS4 instruction failure rate is high"
          description: "Failure rate is {{ $value }} failures/s (threshold: 0.01/s)"

      # Certificate Expiring Alert
      - alert: AS4CertificateExpiring
        expr: as4_certificate_days_until_expiry < 30
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "AS4 certificate expiring soon"
          description: "Certificate expires in {{ $value }} days"

      # System Unavailable Alert
      # Requires a scrape job named `as4-settlement`.
      - alert: AS4SystemUnavailable
        expr: up{job="as4-settlement"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "AS4 Settlement system is down"
          description: "AS4 service is not responding"

      # Database Connection Alert
      - alert: AS4DatabaseConnectionFailed
        expr: as4_database_connection_status == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "AS4 database connection failed"
          description: "Cannot connect to database"

      # Redis Connection Alert
      - alert: AS4RedisConnectionFailed
        expr: as4_redis_connection_status == 0
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "AS4 Redis connection failed"
          description: "Cannot connect to Redis (nonce tracking may be affected)"

      # High Memory Usage Alert
      - alert: AS4HighMemoryUsage
        expr: as4_memory_usage_percent > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "AS4 system memory usage is high"
          description: "Memory usage is {{ $value }}%"

      # Queue Backlog Alert
      - alert: AS4QueueBacklog
        expr: as4_instruction_queue_length > 1000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "AS4 instruction queue backlog"
          description: "Queue length is {{ $value }} instructions"

View File

@@ -0,0 +1,234 @@
# Grafana Dashboards
This directory contains Grafana dashboard JSON files for monitoring the DBIS Core Banking System.
## Dashboard List
### 1. System Health Dashboard (`system-health.json`)
**Purpose**: Overall system health and status monitoring
**Key Metrics**:
- Service health status
- Overall system availability
- Error rates (5xx, 4xx)
- CPU and memory usage by service
- Database connection pool status
- Active sessions
- Queue lengths
**Refresh Interval**: 30s
**Tags**: `system`, `health`, `overview`
---
### 2. API Performance Dashboard (`api-performance.json`)
**Purpose**: API endpoint performance and latency monitoring
**Key Metrics**:
- Request rate by endpoint
- Response time percentiles (P50, P95, P99)
- Error rate by endpoint
- Top endpoints by request volume
- Request distribution by method and status code
- SLO compliance (availability, latency)
- Request duration distribution
**Refresh Interval**: 30s
**Tags**: `api`, `performance`, `latency`
---
### 3. Ledger Operations Dashboard (`ledger-operations.json`)
**Purpose**: Ledger entry and settlement operations monitoring
**Key Metrics**:
- Ledger entry rate by ledger ID
- Ledger entry amount by ledger and currency
- Settlement rate by status
- Settlement duration percentiles
- Outbox queue status and processing rate
- Balance updates by currency
- Failed posting operations
- Total ledger entries, active accounts, pending settlements
**Refresh Interval**: 30s
**Tags**: `ledger`, `transactions`, `settlement`
---
### 4. Security & Compliance Dashboard (`security-compliance.json`)
**Purpose**: Security events and compliance monitoring
**Key Metrics**:
- Authentication failures by reason
- Authorization failures by resource and action
- Sanctions screening results
- AML risk score distribution
- Audit log events by type
- Policy violations by type
- Failed transactions by reason
- Encryption key rotation status
- Data access events (PII, Financial)
- Security incidents and compliance violations (24h)
**Refresh Interval**: 30s
**Tags**: `security`, `compliance`, `audit`
---
## Installation
### Import Dashboards to Grafana
1. **Via Grafana UI**:
- Navigate to Grafana → Dashboards → Import
- Upload the JSON file or paste JSON content
- Configure data source and settings
- Save dashboard
2. **Via Grafana Provisioning**:
Create a provisioning configuration file:
```yaml
# grafana/provisioning/dashboards/dashboards.yml
apiVersion: 1
providers:
- name: 'DBIS Core Dashboards'
orgId: 1
folder: 'DBIS Core'
type: file
disableDeletion: false
updateIntervalSeconds: 10
allowUiUpdates: true
options:
path: /etc/grafana/dashboards
```
Copy dashboard files to the provisioned path:
```bash
cp dbis_core/monitoring/grafana/dashboards/*.json /etc/grafana/dashboards/
```
3. **Via Grafana API**:
```bash
# Import dashboard via API
curl -X POST \
-H "Content-Type: application/json" \
-H "Authorization: Bearer <grafana-api-key>" \
-d @system-health.json \
http://grafana:3000/api/dashboards/db
```
---
## Configuration
### Data Source Configuration
Ensure Prometheus data source is configured in Grafana:
1. Navigate to Configuration → Data Sources
2. Add Prometheus data source
3. Set URL: `http://prometheus:9090`
4. Configure scrape interval and timeouts
### Variable Configuration
Some dashboards may use variables for filtering:
- `$datasource`: Prometheus data source
- `$service`: Service name filter (optional)
- `$environment`: Environment filter (optional)
---
## Metrics Requirements
### Prometheus Metrics
These dashboards expect the following Prometheus metrics to be exported:
#### System Metrics
- `up{job="dbis-core"}`
- `process_cpu_seconds_total{job="dbis-core"}`
- `process_resident_memory_bytes{job="dbis-core"}`
- `db_pool_size{job="dbis-core"}`
- `db_pool_active{job="dbis-core"}`
- `db_pool_idle{job="dbis-core"}`
- `dbis_sessions_active`
- `dbis_queue_length{queue}`
#### API Metrics
- `http_requests_total{job="dbis-core",endpoint,method,status}`
- `http_request_duration_seconds_bucket{job="dbis-core",endpoint,le}`
#### Ledger Metrics
- `ledger_entries_total{ledger_id}`
- `ledger_entry_amount_total{ledger_id,currency_code}`
- `settlement_total{status}`
- `settlement_duration_seconds_bucket{le}`
- `dbis_outbox_queue_length`
- `outbox_processed_total{status}`
- `balance_updates_total{currency_code}`
- `ledger_posting_errors_total{error_type}`
- `ledger_entries_count`
- `bank_accounts_count`
- `settlements_pending_count`
#### Security Metrics
- `authentication_failures_total{reason}`
- `authorization_failures_total{resource,action}`
- `sanctions_screening_total{result}`
- `aml_risk_score_bucket{le}`
- `audit_log_events_total{event_type}`
- `policy_violations_total{policy_type,violation_type}`
- `transaction_failures_total{reason}`
- `data_access_events_total{data_type,operation}`
- `security_incidents_total`
- `compliance_violations_total`
- `encryption_key_rotation_status`
---
## Alerting
### Recommended Alerts
Based on these dashboards, configure alerts for:
1. **System Health**:
- Service down (`up{job="dbis-core"} == 0`)
- High error rate (`sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) > 0.05`, i.e. more than 5% of requests returning 5xx)
- High memory usage (`process_resident_memory_bytes > 8 * 1024^3`, i.e. 8 GiB)
- Database connection pool exhausted (`db_pool_active >= db_pool_size * 0.9`)
2. **API Performance**:
- P95 latency > 500ms
- Availability < 99.9%
- Error rate > 0.1%
3. **Ledger Operations**:
- Outbox queue length > 1000
- Settlement failure rate > 1%
- Failed posting operations > 10/min
4. **Security & Compliance**:
- Authentication failure rate > 5%
- Sanctions match detected
- AML risk score > 80
- Security incident detected
- Compliance violation detected
---
## References
- Metrics Specification: `explorer-monorepo/docs/specs/observability/metrics-monitoring.md`
- Tracing Dashboard: `smom-dbis-138/monitoring/grafana/dashboards/tracing.json`
- OpenTelemetry Configuration: `smom-dbis-138/monitoring/opentelemetry/otel-collector.yaml`

View File

@@ -0,0 +1,158 @@
{
"dashboard": {
"title": "DBIS Core - API Performance",
"tags": ["api", "performance", "latency"],
"timezone": "browser",
"schemaVersion": 27,
"version": 1,
"refresh": "30s",
"panels": [
{
"id": 1,
"title": "Request Rate",
"type": "graph",
"targets": [
{
"expr": "sum(rate(http_requests_total{job=\"dbis-core\"}[5m])) by (endpoint)",
"legendFormat": "{{endpoint}}"
}
],
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 0}
},
{
"id": 2,
"title": "Response Time Percentiles",
"type": "graph",
"targets": [
{
"expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket{job=\"dbis-core\"}[5m])) by (le, endpoint))",
"legendFormat": "{{endpoint}} - P50"
},
{
"expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{job=\"dbis-core\"}[5m])) by (le, endpoint))",
"legendFormat": "{{endpoint}} - P95"
},
{
"expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{job=\"dbis-core\"}[5m])) by (le, endpoint))",
"legendFormat": "{{endpoint}} - P99"
}
],
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 0}
},
{
"id": 3,
"title": "Error Rate by Endpoint",
"type": "graph",
"targets": [
{
"expr": "sum(rate(http_requests_total{job=\"dbis-core\",status=~\"5..\"}[5m])) by (endpoint)",
"legendFormat": "{{endpoint}} - 5xx"
},
{
"expr": "sum(rate(http_requests_total{job=\"dbis-core\",status=~\"4..\"}[5m])) by (endpoint)",
"legendFormat": "{{endpoint}} - 4xx"
}
],
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8}
},
{
"id": 4,
"title": "Top Endpoints by Request Volume",
"type": "bargauge",
"targets": [
{
"expr": "topk(10, sum(rate(http_requests_total{job=\"dbis-core\"}[5m])) by (endpoint))",
"legendFormat": "{{endpoint}}"
}
],
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 8}
},
{
"id": 5,
"title": "Request Rate by Method",
"type": "piechart",
"targets": [
{
"expr": "sum(rate(http_requests_total{job=\"dbis-core\"}[5m])) by (method)",
"legendFormat": "{{method}}"
}
],
"gridPos": {"h": 8, "w": 8, "x": 0, "y": 16}
},
{
"id": 6,
"title": "Request Rate by Status Code",
"type": "piechart",
"targets": [
{
"expr": "sum(rate(http_requests_total{job=\"dbis-core\"}[5m])) by (status)",
"legendFormat": "{{status}}"
}
],
"gridPos": {"h": 8, "w": 8, "x": 8, "y": 16}
},
{
"id": 7,
"title": "SLO Compliance - Availability",
"type": "stat",
"targets": [
{
"expr": "(1 - (sum(rate(http_requests_total{job=\"dbis-core\",status=~\"5..\"}[5m])) / sum(rate(http_requests_total{job=\"dbis-core\"}[5m])))) * 100",
"legendFormat": "Availability %"
}
],
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{"value": 0, "color": "red"},
{"value": 99.9, "color": "yellow"},
{"value": 99.99, "color": "green"}
]
},
"unit": "percent"
}
},
"gridPos": {"h": 4, "w": 4, "x": 16, "y": 16}
},
{
"id": 8,
"title": "SLO Compliance - P95 Latency",
"type": "stat",
"targets": [
{
"expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{job=\"dbis-core\"}[5m])) by (le))",
"legendFormat": "P95 Latency"
}
],
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{"value": 0, "color": "green"},
{"value": 0.5, "color": "yellow"},
{"value": 1.0, "color": "red"}
]
},
"unit": "s"
}
},
"gridPos": {"h": 4, "w": 4, "x": 20, "y": 16}
},
{
"id": 9,
"title": "Request Duration Distribution",
"type": "heatmap",
"targets": [
{
"expr": "sum(rate(http_request_duration_seconds_bucket{job=\"dbis-core\"}[5m])) by (le)",
"legendFormat": "{{le}}"
}
],
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 24}
}
]
}
}

View File

@@ -0,0 +1,164 @@
{
"dashboard": {
"title": "DBIS Core - Ledger Operations",
"tags": ["ledger", "transactions", "settlement"],
"timezone": "browser",
"schemaVersion": 27,
"version": 1,
"refresh": "30s",
"panels": [
{
"id": 1,
"title": "Ledger Entry Rate",
"type": "graph",
"targets": [
{
"expr": "sum(rate(ledger_entries_total[5m])) by (ledger_id)",
"legendFormat": "{{ledger_id}}"
}
],
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 0}
},
{
"id": 2,
"title": "Ledger Entry Amount by Ledger",
"type": "graph",
"targets": [
{
"expr": "sum(rate(ledger_entry_amount_total[5m])) by (ledger_id, currency_code)",
"legendFormat": "{{ledger_id}} - {{currency_code}}"
}
],
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 0}
},
{
"id": 3,
"title": "Settlement Rate",
"type": "graph",
"targets": [
{
"expr": "sum(rate(settlement_total[5m])) by (status)",
"legendFormat": "{{status}}"
}
],
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8}
},
{
"id": 4,
"title": "Settlement Duration",
"type": "graph",
"targets": [
{
"expr": "histogram_quantile(0.50, sum(rate(settlement_duration_seconds_bucket[5m])) by (le))",
"legendFormat": "P50"
},
{
"expr": "histogram_quantile(0.95, sum(rate(settlement_duration_seconds_bucket[5m])) by (le))",
"legendFormat": "P95"
},
{
"expr": "histogram_quantile(0.99, sum(rate(settlement_duration_seconds_bucket[5m])) by (le))",
"legendFormat": "P99"
}
],
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 8}
},
{
"id": 5,
"title": "Outbox Queue Status",
"type": "graph",
"targets": [
{
"expr": "dbis_outbox_queue_length",
"legendFormat": "Queue Length"
}
],
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 16}
},
{
"id": 6,
"title": "Outbox Processing Rate",
"type": "graph",
"targets": [
{
"expr": "sum(rate(outbox_processed_total[5m])) by (status)",
"legendFormat": "{{status}}"
}
],
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 16}
},
{
"id": 7,
"title": "Balance Updates",
"type": "graph",
"targets": [
{
"expr": "sum(rate(balance_updates_total[5m])) by (currency_code)",
"legendFormat": "{{currency_code}}"
}
],
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 24}
},
{
"id": 8,
"title": "Failed Posting Operations",
"type": "graph",
"targets": [
{
"expr": "sum(rate(ledger_posting_errors_total[5m])) by (error_type)",
"legendFormat": "{{error_type}}"
}
],
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 24}
},
{
"id": 9,
"title": "Total Ledger Entries",
"type": "stat",
"targets": [
{
"expr": "ledger_entries_count",
"legendFormat": "Total Entries"
}
],
"gridPos": {"h": 4, "w": 6, "x": 0, "y": 32}
},
{
"id": 10,
"title": "Active Accounts",
"type": "stat",
"targets": [
{
"expr": "bank_accounts_count",
"legendFormat": "Active Accounts"
}
],
"gridPos": {"h": 4, "w": 6, "x": 6, "y": 32}
},
{
"id": 11,
"title": "Pending Settlements",
"type": "stat",
"targets": [
{
"expr": "settlements_pending_count",
"legendFormat": "Pending"
}
],
"gridPos": {"h": 4, "w": 6, "x": 12, "y": 32}
},
{
"id": 12,
"title": "Successful Settlements (24h)",
"type": "stat",
"targets": [
{
"expr": "increase(settlement_total{status=\"SETTLED\"}[24h])",
"legendFormat": "Successful"
}
],
"gridPos": {"h": 4, "w": 6, "x": 18, "y": 32}
}
]
}
}

View File

@@ -0,0 +1,167 @@
{
"dashboard": {
"title": "DBIS Core - Security & Compliance",
"tags": ["security", "compliance", "audit"],
"timezone": "browser",
"schemaVersion": 27,
"version": 1,
"refresh": "30s",
"panels": [
{
"id": 1,
"title": "Authentication Failures",
"type": "graph",
"targets": [
{
"expr": "sum(rate(authentication_failures_total[5m])) by (reason)",
"legendFormat": "{{reason}}"
}
],
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 0}
},
{
"id": 2,
"title": "Authorization Failures",
"type": "graph",
"targets": [
{
"expr": "sum(rate(authorization_failures_total[5m])) by (resource, action)",
"legendFormat": "{{resource}} - {{action}}"
}
],
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 0}
},
{
"id": 3,
"title": "Sanctions Screening Results",
"type": "graph",
"targets": [
{
"expr": "sum(rate(sanctions_screening_total[5m])) by (result)",
"legendFormat": "{{result}}"
}
],
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8}
},
{
"id": 4,
"title": "AML Risk Score Distribution",
"type": "graph",
"targets": [
{
"expr": "histogram_quantile(0.50, sum(rate(aml_risk_score_bucket[5m])) by (le))",
"legendFormat": "P50"
},
{
"expr": "histogram_quantile(0.95, sum(rate(aml_risk_score_bucket[5m])) by (le))",
"legendFormat": "P95"
},
{
"expr": "histogram_quantile(0.99, sum(rate(aml_risk_score_bucket[5m])) by (le))",
"legendFormat": "P99"
}
],
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 8}
},
{
"id": 5,
"title": "Audit Log Events",
"type": "graph",
"targets": [
{
"expr": "sum(rate(audit_log_events_total[5m])) by (event_type)",
"legendFormat": "{{event_type}}"
}
],
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 16}
},
{
"id": 6,
"title": "Policy Violations",
"type": "graph",
"targets": [
{
"expr": "sum(rate(policy_violations_total[5m])) by (policy_type, violation_type)",
"legendFormat": "{{policy_type}} - {{violation_type}}"
}
],
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 16}
},
{
"id": 7,
"title": "Failed Transactions by Reason",
"type": "piechart",
"targets": [
{
"expr": "sum(rate(transaction_failures_total[5m])) by (reason)",
"legendFormat": "{{reason}}"
}
],
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 24}
},
{
"id": 8,
"title": "Encryption Key Rotation Status",
"type": "stat",
"targets": [
{
"expr": "encryption_key_rotation_status",
"legendFormat": "Status"
}
],
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{"value": 0, "color": "red"},
{"value": 1, "color": "green"}
]
}
}
},
"gridPos": {"h": 4, "w": 6, "x": 12, "y": 24}
},
{
"id": 9,
"title": "Data Access Events",
"type": "graph",
"targets": [
{
"expr": "sum(rate(data_access_events_total{data_type=\"PII\"}[5m])) by (operation)",
"legendFormat": "PII - {{operation}}"
},
{
"expr": "sum(rate(data_access_events_total{data_type=\"FINANCIAL\"}[5m])) by (operation)",
"legendFormat": "Financial - {{operation}}"
}
],
"gridPos": {"h": 8, "w": 6, "x": 18, "y": 24}
},
{
"id": 10,
"title": "Security Incidents (24h)",
"type": "stat",
"targets": [
{
"expr": "increase(security_incidents_total[24h])",
"legendFormat": "Incidents"
}
],
"gridPos": {"h": 4, "w": 6, "x": 0, "y": 32}
},
{
"id": 11,
"title": "Compliance Violations (24h)",
"type": "stat",
"targets": [
{
"expr": "increase(compliance_violations_total[24h])",
"legendFormat": "Violations"
}
],
"gridPos": {"h": 4, "w": 6, "x": 6, "y": 32}
}
]
}
}

View File

@@ -0,0 +1,147 @@
{
"dashboard": {
"title": "DBIS Core - System Health",
"tags": ["system", "health", "overview"],
"timezone": "browser",
"schemaVersion": 27,
"version": 1,
"refresh": "30s",
"panels": [
{
"id": 1,
"title": "Service Health Status",
"type": "stat",
"targets": [
{
"expr": "up{job=\"dbis-core\"}",
"legendFormat": "{{instance}}"
}
],
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{"value": 0, "color": "red"},
{"value": 1, "color": "green"}
]
}
}
},
"gridPos": {"h": 4, "w": 6, "x": 0, "y": 0}
},
{
"id": 2,
"title": "Overall System Status",
"type": "stat",
"targets": [
{
"expr": "avg(up{job=\"dbis-core\"}) * 100",
"legendFormat": "Health %"
}
],
"gridPos": {"h": 4, "w": 6, "x": 6, "y": 0}
},
{
"id": 3,
"title": "Total Error Rate",
"type": "graph",
"targets": [
{
"expr": "sum(rate(http_requests_total{status=~\"5..\"}[5m]))",
"legendFormat": "5xx Errors/sec"
},
{
"expr": "sum(rate(http_requests_total{status=~\"4..\"}[5m]))",
"legendFormat": "4xx Errors/sec"
}
],
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 0}
},
{
"id": 4,
"title": "CPU Usage by Service",
"type": "graph",
"targets": [
{
"expr": "rate(process_cpu_seconds_total{job=\"dbis-core\"}[5m]) * 100",
"legendFormat": "{{instance}} - {{service}}"
}
],
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8}
},
{
"id": 5,
"title": "Memory Usage by Service",
"type": "graph",
"targets": [
{
"expr": "process_resident_memory_bytes{job=\"dbis-core\"} / 1024 / 1024",
"legendFormat": "{{instance}} - {{service}} (MB)"
}
],
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 8}
},
{
"id": 6,
"title": "Database Connection Pool",
"type": "graph",
"targets": [
{
"expr": "db_pool_size{job=\"dbis-core\"}",
"legendFormat": "Pool Size"
},
{
"expr": "db_pool_active{job=\"dbis-core\"}",
"legendFormat": "Active Connections"
},
{
"expr": "db_pool_idle{job=\"dbis-core\"}",
"legendFormat": "Idle Connections"
}
],
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 16}
},
{
"id": 7,
"title": "Request Rate by Service",
"type": "graph",
"targets": [
{
"expr": "sum(rate(http_requests_total{job=\"dbis-core\"}[5m])) by (service)",
"legendFormat": "{{service}}"
}
],
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 16}
},
{
"id": 8,
"title": "Active Sessions",
"type": "stat",
"targets": [
{
"expr": "dbis_sessions_active",
"legendFormat": "Active"
}
],
"gridPos": {"h": 4, "w": 6, "x": 0, "y": 24}
},
{
"id": 9,
"title": "Queue Length",
"type": "graph",
"targets": [
{
"expr": "dbis_queue_length{queue=\"dual_ledger_outbox\"}",
"legendFormat": "Outbox Queue"
},
{
"expr": "dbis_queue_length{queue=\"settlement\"}",
"legendFormat": "Settlement Queue"
}
],
"gridPos": {"h": 8, "w": 18, "x": 6, "y": 24}
}
]
}
}

View File

@@ -0,0 +1,10 @@
# Prometheus scrape job for the AS4 Settlement service.
# Merge the job below into the scrape_configs list of your main prometheus.yml.
scrape_configs:
  - job_name: "as4-settlement"
    # Metrics are exposed under the AS4 API prefix, not the default /metrics.
    metrics_path: "/api/v1/as4/metrics"
    scrape_interval: 15s
    scrape_timeout: 10s
    static_configs:
      - targets:
          - "localhost:3000"

32
monitoring/prometheus.yml Normal file
View File

@@ -0,0 +1,32 @@
# Prometheus configuration for SolaceNet monitoring
global:
  # Defaults applied to every scrape job and rule group that does not
  # set its own interval.
  scrape_interval: 15s
  evaluation_interval: 15s

scrape_configs:
  # DBIS API metrics
  # NOTE(review): the Grafana dashboards in this repository filter on
  # job="dbis-core"; confirm whether this job name is meant to match.
  - job_name: 'dbis-api'
    static_configs:
      - targets: ['dbis-api:3000']
    metrics_path: '/metrics'

  # SolaceNet Gateway metrics
  - job_name: 'solacenet-gateway'
    static_configs:
      - targets: ['solacenet-gateway:8080']
    metrics_path: '/metrics'

  # Redis metrics (if using redis_exporter)
  - job_name: 'redis'
    static_configs:
      - targets: ['redis-exporter:9121']

# Alerting rule files. as4-alerts.yml lives next to alerts.yml in
# monitoring/ and was previously never loaded, so its rules were never
# evaluated.
rule_files:
  - 'alerts.yml'
  - 'as4-alerts.yml'

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 'alertmanager:9093'