Add surgical Besu validator operator helpers

This commit is contained in:
defiQUG
2026-04-13 21:41:35 -07:00
parent b7eebb87b3
commit ee1625a79b
5 changed files with 249 additions and 13 deletions

View File

@@ -5,13 +5,15 @@
# the rest stay at head so the restarted node syncs quickly and consensus can continue.
#
# Usage: ./scripts/maintenance/fix-block-production-staggered-restart.sh [--dry-run]
# Requires: SSH to Proxmox hosts (192.168.11.10 ML110, 192.168.11.11 R630-01, 192.168.11.12 R630-02)
# Requires: SSH to Proxmox hosts; VMID→host from scripts/lib/load-project-env.sh get_host_for_vmid
#   (live: 1000-1002 r630-01, 1003-1004 r630-03 — not ML110).
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
source "${PROJECT_ROOT}/config/ip-addresses.conf" 2>/dev/null || true
# shellcheck source=/dev/null
source "${PROJECT_ROOT}/scripts/lib/load-project-env.sh"
DRY_RUN=false
[[ "${1:-}" == "--dry-run" ]] && DRY_RUN=true
@@ -25,15 +27,19 @@ log_info() { echo -e "${BLUE}[INFO]${NC} $1"; }
# Print a success line: the "[✓]" tag wrapped in the GREEN/NC colour
# variables, followed by the message given as $1.
log_ok() {
  local message=$1
  echo -e "${GREEN}[✓]${NC} ${message}"
}
# Print a warning line: the "[WARN]" tag wrapped in the YELLOW/NC colour
# variables, followed by the message given as $1.
log_warn() {
  local message=$1
  echo -e "${YELLOW}[WARN]${NC} ${message}"
}
# Order: restart one at a time; wait between so restarted node can sync from others
# VMID : host
VALIDATORS=(
"1004:${PROXMOX_HOST_ML110:-192.168.11.10}"
"1003:${PROXMOX_HOST_ML110:-192.168.11.10}"
"1002:${PROXMOX_HOST_R630_01:-192.168.11.11}"
"1001:${PROXMOX_HOST_R630_01:-192.168.11.11}"
"1000:${PROXMOX_HOST_R630_01:-192.168.11.11}"
)
# Order: validators on r630-03 first, then those on r630-01 (runbook: spread restarts; last is 1000 on .11)
RESTART_ORDER=(1004 1003 1002 1001 1000)
# Rebuild the global VALIDATORS array as "vmid:host" entries, one per VMID in
# RESTART_ORDER and in the same order.
# Globals:   RESTART_ORDER (read), VALIDATORS (reset, then appended to)
# Arguments: none
# Outputs:   one error line on stderr when a VMID cannot be mapped to a host
# Returns:   0 on success; 1 if get_host_for_vmid fails or prints nothing
build_validators() {
  VALIDATORS=()
  local vmid host
  for vmid in "${RESTART_ORDER[@]}"; do
    # Check the lookup explicitly instead of relying on errexit: errexit is
    # suspended when this function runs inside a conditional, and a lookup
    # that succeeds with empty output would otherwise yield a "vmid:" entry
    # that later becomes an ssh call to "root@" with no host.
    if ! host="$(get_host_for_vmid "$vmid")" || [[ -z "$host" ]]; then
      printf 'build_validators: no Proxmox host resolved for VMID %s\n' "$vmid" >&2
      return 1
    fi
    VALIDATORS+=("${vmid}:${host}")
  done
}
build_validators
WAIT_BETWEEN=90
RPC="${RPC_URL_138:-http://192.168.11.211:8545}"
@@ -55,10 +61,10 @@ for entry in "${VALIDATORS[@]}"; do
IFS=: read -r vmid host <<< "$entry"
log_info "Restarting validator $vmid on $host..."
if $DRY_RUN; then
echo " Would: ssh root@$host 'pct exec $vmid -- systemctl restart besu-validator'"
echo " Would: ssh root@$host 'pct exec $vmid -- systemctl restart besu-validator.service'"
else
# Allow up to 120s for restart (Besu stop/start can take 1-2 min)
if timeout 120 ssh -o ConnectTimeout=10 -o StrictHostKeyChecking=no root@"$host" "pct exec $vmid -- systemctl restart besu-validator" 2>/dev/null; then
if timeout 120 ssh -o ConnectTimeout=10 -o StrictHostKeyChecking=no root@"$host" "pct exec $vmid -- systemctl restart besu-validator.service" 2>/dev/null; then
log_ok " $vmid restarted"
else
log_warn " $vmid restart timed out or failed (node may still be restarting)"