Refactor code for improved readability and performance
This commit is contained in:
31
scripts/monitoring/prometheus-besu-config.yml
Normal file
31
scripts/monitoring/prometheus-besu-config.yml
Normal file
@@ -0,0 +1,31 @@
|
||||
# Prometheus Configuration for Besu Metrics
#
# Standalone use: pass this file to Prometheus as-is.
# Merging into an existing prometheus.yml: copy only the "- job_name: 'besu'"
# entry (with everything nested under it) into that file's existing
# scrape_configs list. Do not add a second top-level scrape_configs key.
#
# NOTE(review): the targets below are annotated "DHCP assigned". Static
# targets stop matching if a lease changes - confirm the leases are reserved.

scrape_configs:
  - job_name: 'besu'
    scrape_interval: 15s
    static_configs:
      # Validators (VMID 1000-1004) - metrics enabled but may not expose RPC
      - targets:
          - '192.168.11.100:9545'  # validator-1 (DHCP assigned)
          - '192.168.11.101:9545'  # validator-2 (DHCP assigned)
          - '192.168.11.102:9545'  # validator-3 (DHCP assigned)
          - '192.168.11.103:9545'  # validator-4 (DHCP assigned)
          - '192.168.11.104:9545'  # validator-5 (DHCP assigned)
        labels:
          role: 'validator'

      # Sentries (VMID 1500-1503)
      - targets:
          - '192.168.11.150:9545'  # sentry-1 (DHCP assigned)
          - '192.168.11.151:9545'  # sentry-2 (DHCP assigned)
          - '192.168.11.152:9545'  # sentry-3 (DHCP assigned)
          - '192.168.11.153:9545'  # sentry-4 (DHCP assigned)
        labels:
          role: 'sentry'

      # RPC Nodes (VMID 2500-2502)
      - targets:
          - '192.168.11.250:9545'  # rpc-1 (DHCP assigned)
          - '192.168.11.251:9545'  # rpc-2 (DHCP assigned)
          - '192.168.11.252:9545'  # rpc-3 (DHCP assigned)
        labels:
          role: 'rpc'
|
||||
51
scripts/monitoring/setup-health-check-cron.sh
Executable file
51
scripts/monitoring/setup-health-check-cron.sh
Executable file
@@ -0,0 +1,51 @@
|
||||
#!/bin/bash
# Setup Health Check Cron Job
# Installs cron jobs to monitor Besu node health
#
# Generates scripts/monitoring/health-check-cron-wrapper.sh under the project
# root and registers it in the invoking user's crontab (every 5 minutes).
# Requires the pct command, i.e. must be run on the Proxmox host.

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

if ! command -v pct >/dev/null 2>&1; then
  # Diagnostics go to stderr so they are not mistaken for normal output.
  echo "Error: pct command not found. This script must be run on Proxmox host." >&2
  exit 1
fi

LOG_DIR="$PROJECT_ROOT/logs/health-checks"
mkdir -p "$LOG_DIR"

WRAPPER="$PROJECT_ROOT/scripts/monitoring/health-check-cron-wrapper.sh"

# Create cron job script.
# The heredoc delimiter is quoted, so nothing in the body is expanded here;
# the wrapper resolves its own paths each time cron runs it.
cat > "$WRAPPER" << 'CRONSCRIPT'
#!/bin/bash
# Health check wrapper for cron
# Checks all Besu nodes and logs results

# cron starts jobs with a minimal PATH; append the sbin directories so that
# administrative tools (such as pct) can be found by the health check.
export PATH="${PATH:+$PATH:}/usr/local/sbin:/usr/sbin:/sbin"

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
LOG_DIR="$PROJECT_ROOT/logs/health-checks"
HEALTH_CHECK="$PROJECT_ROOT/scripts/health/check-node-health.sh"
TIMESTAMP=$(date +%Y%m%d-%H%M%S)

# The log directory may have been removed since installation.
mkdir -p "$LOG_DIR"

# Nothing is run (and nothing is logged) when the health check script is absent.
if [[ -f "$HEALTH_CHECK" ]]; then
  for vmid in 1000 1001 1002 1003 1004 1500 1501 1502 1503 2500 2501 2502; do
    "$HEALTH_CHECK" "$vmid" >> "$LOG_DIR/health-$vmid-$TIMESTAMP.log" 2>&1
  done
fi

# Cleanup old logs (keep 7 days)
find "$LOG_DIR" -name "health-*.log" -mtime +7 -delete 2>/dev/null || true
CRONSCRIPT

chmod +x "$WRAPPER"

# Add to crontab (every 5 minutes)
CRON_JOB="*/5 * * * * $WRAPPER"

# Capture the crontab once instead of piping it into `grep -q`: under
# pipefail, grep exiting on its first match can kill `crontab -l` with SIGPIPE
# and make an existing entry look absent. A missing crontab is treated as empty.
existing_crontab="$(crontab -l 2>/dev/null || true)"

if grep -qF "health-check-cron-wrapper.sh" <<< "$existing_crontab"; then
  echo "Cron job already exists"
else
  {
    if [[ -n "$existing_crontab" ]]; then
      printf '%s\n' "$existing_crontab"
    fi
    printf '%s\n' "$CRON_JOB"
  } | crontab -
  echo "✓ Health check cron job installed (runs every 5 minutes)"
  echo " Logs: $LOG_DIR/"
  echo " To remove: crontab -e (then delete the line)"
fi
|
||||
73
scripts/monitoring/simple-alert.sh
Executable file
73
scripts/monitoring/simple-alert.sh
Executable file
@@ -0,0 +1,73 @@
|
||||
#!/bin/bash
# Simple Alert Script
# Sends alerts when Besu services are down
# Can be extended to send email, Slack, etc.
#
# Environment:
#   ALERT_EMAIL - optional recipient address; when empty no email is attempted.

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

# cron starts jobs with a minimal PATH; append the sbin directories so pct can
# be found when this script is scheduled.
export PATH="${PATH:+$PATH:}/usr/local/sbin:/usr/sbin:/sbin"

# Without pct every `pct status` probe in this script fails quietly and each
# container would be reported as not running, so refuse to run instead of
# raising false alerts.
if ! command -v pct >/dev/null 2>&1; then
  echo "Error: pct command not found. This script must be run on Proxmox host." >&2
  exit 1
fi

# Configuration
ALERT_EMAIL="${ALERT_EMAIL:-}"
ALERT_LOG="$PROJECT_ROOT/logs/alerts.log"
ALERT_SENT_LOG="$PROJECT_ROOT/logs/alerts-sent.log"

# Ensure log directory exists (both log files live in the same directory)
mkdir -p "$(dirname "$ALERT_LOG")"
|
||||
|
||||
#######################################
# Record an alert and notify once per distinct message text.
# Every call appends a timestamped line to ALERT_LOG. The notification part
# (optional email plus a console line) runs only the first time a given
# message is seen; seen messages are tracked by MD5 digest in ALERT_SENT_LOG.
# Globals:   ALERT_LOG, ALERT_SENT_LOG (appended to), ALERT_EMAIL (read)
# Arguments: $1 - alert message
# Outputs:   "ALERT: <message>" on stdout for a first-time alert
# Returns:   0, including when the alert was already sent
#######################################
log_alert() {
  local message="$1"
  # Declared separately from the assignments so `local` cannot mask a failing
  # command: an empty digest would make the lookup below match every line and
  # silently suppress all alerts.
  local timestamp alert_key

  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] ALERT: %s\n' "$timestamp" "$message" >> "$ALERT_LOG"

  # Check if we've already sent this alert (avoid spam).
  # printf instead of echo: echo would treat a message like "-n" as an option.
  alert_key=$(printf '%s\n' "$message" | md5sum | cut -d' ' -f1)
  if grep -qF -- "$alert_key" "$ALERT_SENT_LOG" 2>/dev/null; then
    return 0
  fi
  printf '[%s] %s\n' "$timestamp" "$alert_key" >> "$ALERT_SENT_LOG"

  # Send email if configured. Delivery stays best-effort, but a failure leaves
  # a trace in the alert log instead of vanishing.
  if [[ -n "$ALERT_EMAIL" ]] && command -v mail >/dev/null 2>&1; then
    if ! printf '%s\n' "$message" \
      | mail -s "Besu Alert: Container Issue" "$ALERT_EMAIL" 2>/dev/null; then
      printf '[%s] WARNING: failed to email alert to %s\n' \
        "$timestamp" "$ALERT_EMAIL" >> "$ALERT_LOG"
    fi
  fi

  # Log to console
  printf 'ALERT: %s\n' "$message"
}
|
||||
|
||||
# VMIDs of every Besu container; shared by both passes below.
BESU_VMIDS=(1000 1001 1002 1003 1004 1500 1501 1502 1503 2500 2501 2502)

# Root filesystem usage (percent) above which an alert is raised;
# above 90% used means less than 10% free.
DISK_USAGE_THRESHOLD=90

# Print the service name for a VMID, or nothing for a VMID outside the
# validator (1000-1004), sentry (1500-1503) and RPC (2500-2502) ranges.
besu_service_for_vmid() {
  local vmid="$1"
  if (( vmid >= 1000 && vmid <= 1004 )); then
    echo "besu-validator"
  elif (( vmid >= 1500 && vmid <= 1503 )); then
    echo "besu-sentry"
  elif (( vmid >= 2500 && vmid <= 2502 )); then
    echo "besu-rpc"
  fi
}

# Succeed when `pct status` works and its output mentions "running".
# The output is captured rather than piped into `grep -q`, which under
# pipefail could fail spuriously when grep exits before pct finishes writing.
container_is_running() {
  local status
  status=$(pct status "$1" 2>/dev/null) || return 1
  [[ "$status" == *running* ]]
}

# Read `df -P` output on stdin and print the use percentage of the first
# filesystem without the "%" sign. Prints nothing when that field is missing
# or is not a plain integer.
parse_df_use_percent() {
  awk 'NR == 2 { sub(/%$/, "", $5); if ($5 ~ /^[0-9]+$/) print $5 }'
}

# Check all containers
for vmid in "${BESU_VMIDS[@]}"; do
  # Check if container is running
  if ! container_is_running "$vmid"; then
    log_alert "Container $vmid is not running"
    continue
  fi

  # Determine service name
  service_name=$(besu_service_for_vmid "$vmid")

  # Check service status
  if [[ -n "$service_name" ]] \
    && ! pct exec "$vmid" -- systemctl is-active --quiet "$service_name" 2>/dev/null; then
    log_alert "Service $service_name on container $vmid is not running"
  fi
done

# Check disk space (alert if < 10% free)
for vmid in "${BESU_VMIDS[@]}"; do
  container_is_running "$vmid" || continue

  # df -P keeps each filesystem on a single line, so column 5 is always the
  # use percentage. stderr is silenced on pct exec itself, and a failed or
  # unparsable probe counts as 0% so it can never raise a false alert.
  disk_usage=$(pct exec "$vmid" -- df -P / 2>/dev/null | parse_df_use_percent) || disk_usage=""
  disk_usage=${disk_usage:-0}

  if (( disk_usage > DISK_USAGE_THRESHOLD )); then
    log_alert "Container $vmid disk usage is at ${disk_usage}%"
  fi
done
|
||||
Reference in New Issue
Block a user