Add surgical Besu validator operator helpers

This commit is contained in:
defiQUG
2026-04-13 21:41:35 -07:00
parent b7eebb87b3
commit ee1625a79b
5 changed files with 249 additions and 13 deletions

View File

@@ -0,0 +1,62 @@
#!/usr/bin/env bash
# Offload one Besu validator LXC from r630-01 to r630-04 to reduce *real* CPU contention on the
# source host (same JVM work, fewer co-scheduled Besu processes per NUMA/socket).
#
# Default VMID 1001 (historically problematic when co-packed with 1000/1002); override with --vmid.
# Does not change Besu *allocation* inside the guest — it spreads physical load across nodes.
#
# Usage:
# bash scripts/operator/migrate-besu-validator-r630-01-to-r630-04.sh
# bash scripts/operator/migrate-besu-validator-r630-01-to-r630-04.sh --apply
# bash scripts/operator/migrate-besu-validator-r630-01-to-r630-04.sh --vmid 1002 --apply
#
# Requires: cluster membership, r630-04 online, storage (default local-lvm on target).
# Optional: PROXMOX_OPS_ALLOWED_VMIDS, PROXMOX_OPS_APPLY (see proxmox-production-guard.sh).
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
# shellcheck source=/dev/null
source "${PROJECT_ROOT}/config/ip-addresses.conf" 2>/dev/null || true
# shellcheck source=/dev/null
source "${PROJECT_ROOT}/scripts/lib/proxmox-production-guard.sh"
SRC="${PROXMOX_HOST_R630_01:-192.168.11.11}"
DST_NODE="${BESU_VALIDATOR_MIGRATE_TARGET_NODE:-r630-04}"
STORE="${BESU_VALIDATOR_MIGRATE_TARGET_STORAGE:-local-lvm}"
VMID="${BESU_VALIDATOR_MIGRATE_VMID:-1001}"
SSH_OPTS=(-o ConnectTimeout=20 -o BatchMode=yes -o StrictHostKeyChecking=no)
APPLY=false
while [[ $# -gt 0 ]]; do
  case "$1" in
    --apply) APPLY=true ;;
    --vmid)
      # Fail with a diagnostic (not a silent exit) when the value is missing.
      [[ $# -ge 2 ]] || { echo "--vmid requires a value" >&2; exit 2; }
      VMID="$2"
      shift 2
      continue
      ;;
    -h|--help) sed -n '1,28p' "$0"; exit 0 ;;
    *) echo "Unknown: $1" >&2; exit 2 ;;
  esac
  shift
done
# Validate before interpolating into the remote command line (consistent with
# restart-besu-validator-single.sh) so a typo can never reach `pct migrate`.
[[ "$VMID" =~ ^[0-9]+$ ]] || { echo "Bad vmid: $VMID" >&2; exit 2; }
# PVE 9+: --target-storage (see migrate-ml110-besu-rpc-to-r630-02-03.sh). Older clusters may use --storage.
CMD="pct migrate ${VMID} ${DST_NODE} --target-storage ${STORE} --restart 1"
if ! pguard_require_apply_flag "$APPLY"; then
  echo "[dry-run] From source node (run as root on node that currently owns the CT):"
  echo " ssh root@${SRC} \"$CMD\""
  echo ""
  echo "After migrate, update get_host_for_vmid in scripts/lib/load-project-env.sh for VMID ${VMID}."
  exit 0
fi
pguard_vmid_allowed "$VMID" || exit 1
echo "[apply] ssh root@${SRC} \"$CMD\""
ssh "${SSH_OPTS[@]}" "root@${SRC}" "$CMD"
echo "[apply] Done. Update scripts/lib/load-project-env.sh get_host_for_vmid for ${VMID} -> ${DST_NODE} IP (${PROXMOX_HOST_R630_04:-192.168.11.14})."

View File

@@ -0,0 +1,65 @@
#!/usr/bin/env bash
# Surgical: restart exactly one Besu validator LXC service (default VMID 1001 — stuck participant).
# Resolves PVE host via get_host_for_vmid (scripts/lib/load-project-env.sh). No other CTs touched.
#
# Usage:
# bash scripts/operator/restart-besu-validator-single.sh --dry-run
# PROXMOX_OPS_APPLY=1 PROXMOX_OPS_ALLOWED_VMIDS=1001 bash scripts/operator/restart-besu-validator-single.sh --vmid 1001 --apply
#
# Requires: LAN SSH to Proxmox. Mutations require --apply or PROXMOX_OPS_APPLY=1 and (if set) allowlist.
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
# shellcheck source=/dev/null
source "${PROJECT_ROOT}/scripts/lib/load-project-env.sh"
# shellcheck source=/dev/null
source "${PROJECT_ROOT}/scripts/lib/proxmox-production-guard.sh"
VMID="${BESU_SURGICAL_RESTART_VMID:-1001}"
APPLY=false
DRY=false
SSH_OPTS=(-o ConnectTimeout=15 -o BatchMode=yes -o StrictHostKeyChecking=no)
# Print the header comment block of this script as usage text.
usage() {
  sed -n '1,18p' "$0"
}
while [[ $# -gt 0 ]]; do
  case "$1" in
    --vmid)
      # Guard the lookahead: under `set -u` a bare `--vmid` would otherwise
      # die with an opaque "unbound variable" error instead of usage + exit 2.
      [[ $# -ge 2 ]] || { echo "--vmid requires a value" >&2; usage >&2; exit 2; }
      VMID="$2"; shift 2 ;;
    --apply) APPLY=true; shift ;;
    --dry-run) DRY=true; shift ;;
    -h|--help) usage; exit 0 ;;
    *) echo "Unknown: $1" >&2; usage >&2; exit 2 ;;
  esac
done
[[ "$VMID" =~ ^[0-9]+$ ]] || { echo "Bad vmid: $VMID" >&2; exit 2; }
host="$(get_host_for_vmid "$VMID")"
unit="besu-validator.service"
if $DRY || ! pguard_require_apply_flag "$APPLY"; then
  echo "[dry-run] ssh root@${host} pct exec ${VMID} -- systemctl restart ${unit}"
  echo "[dry-run] Then: cast block-number --rpc-url \${RPC_URL_138:-http://192.168.11.211:8545} (repeat)"
  exit 0
fi
pguard_vmid_allowed "$VMID" || exit 1
echo "[apply] VMID ${VMID} on ${host}: systemctl restart ${unit}"
if ssh "${SSH_OPTS[@]}" "root@${host}" "pct exec ${VMID} -- systemctl restart ${unit}"; then
  echo "[apply] restart command returned 0"
else
  rc=$?
  echo "[apply] restart failed (exit $rc)" >&2
  exit 1
fi
sleep 5
# `systemctl is-active` prints a single state word; compare it exactly.
# BUGFIX: the previous substring `grep -q active` also matched "inactive",
# reporting a dead service as healthy. `|| true` keeps set -e from aborting
# on a non-zero is-active exit (non-active states return non-zero).
state="$(ssh "${SSH_OPTS[@]}" "root@${host}" "pct exec ${VMID} -- systemctl is-active ${unit}" 2>/dev/null || true)"
if [[ "$state" == "active" ]]; then
  echo "[apply] ${unit} is active"
else
  echo "[apply] WARN: service may not be active yet; check journal on CT ${VMID}" >&2
fi

View File

@@ -0,0 +1,68 @@
#!/usr/bin/env bash
# Revert Proxmox cgroup CPU caps (cores/cpulimit) applied on r630-01 for Besu validators,
# core RPC, and sentries. Those caps throttle *allocation*; they do not reduce Besu's real
# work — use migration / JVM tuning / fewer co-located JVMs to lower *measured* host load.
#
# Restores:
# 1000–1002: cores 4, cpulimit removed (was 2/1)
# 2101: cores 4, cpulimit removed (was 2/2)
# 1500–1502: cores 2, cpulimit removed (was 2/1)
#
# Usage:
# bash scripts/operator/revert-besu-cgroup-caps-r630-01.sh
# bash scripts/operator/revert-besu-cgroup-caps-r630-01.sh --apply
#
# Requires: PROXMOX_OPS_APPLY=1 or --apply (see scripts/lib/proxmox-production-guard.sh).
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
# shellcheck source=/dev/null
source "${PROJECT_ROOT}/config/ip-addresses.conf" 2>/dev/null || true
# shellcheck source=/dev/null
source "${PROJECT_ROOT}/scripts/lib/proxmox-production-guard.sh"
HOST="${PROXMOX_HOST_R630_01:-192.168.11.11}"
SSH_OPTS=(-o ConnectTimeout=12 -o BatchMode=yes -o StrictHostKeyChecking=no)
APPLY=false
while [[ $# -gt 0 ]]; do
  case "$1" in
    --apply) APPLY=true ;;
    -h|--help) sed -n '1,25p' "$0"; exit 0 ;;
    *) echo "Unknown: $1" >&2; exit 2 ;;
  esac
  shift
done
# Dry-run line for one validator CT: restore 4 cores, drop any cpulimit cap.
revert_validator() {
  local vmid="$1"
  echo " pct set $vmid --cores 4 --delete cpulimit"
}
# Dry-run line for one sentry CT: restore 2 cores, drop any cpulimit cap.
revert_sentry() {
  local vmid="$1"
  echo " pct set $vmid --cores 2 --delete cpulimit"
}
if ! pguard_require_apply_flag "$APPLY"; then
  echo "[dry-run] On root@${HOST}:"
  for v in 1000 1001 1002; do revert_validator "$v"; done
  echo " pct set 2101 --cores 4 --delete cpulimit"
  for v in 1500 1501 1502; do revert_sentry "$v"; done
  exit 0
fi
for v in 1000 1001 1002; do
  pguard_vmid_allowed "$v" || continue
  ssh "${SSH_OPTS[@]}" "root@${HOST}" "pct set $v --cores 4 --delete cpulimit"
done
# Plain `if` instead of `allowed && ssh || true`: the short-circuit form also
# swallowed genuine ssh/`pct set` failures, while the loops above abort under
# `set -e`. Now a disallowed VMID is skipped but a real failure stops the run.
if pguard_vmid_allowed "2101"; then
  ssh "${SSH_OPTS[@]}" "root@${HOST}" "pct set 2101 --cores 4 --delete cpulimit"
fi
for v in 1500 1501 1502; do
  pguard_vmid_allowed "$v" || continue
  ssh "${SSH_OPTS[@]}" "root@${HOST}" "pct set $v --cores 2 --delete cpulimit"
done
echo "--- post (pct + host cgroup sample) ---"
ssh "${SSH_OPTS[@]}" "root@${HOST}" "for v in 1000 2101 1500; do echo \"=== \$v ===\"; pct config \$v | grep -E '^(cores|cpulimit):' || true; echo -n \" cgroup cpu.max: \"; cat /sys/fs/cgroup/lxc/\$v/cpu.max; echo; done; uptime"