Add surgical Besu validator operator helpers

This commit is contained in:
defiQUG
2026-04-13 21:41:35 -07:00
parent b7eebb87b3
commit ee1625a79b
5 changed files with 249 additions and 13 deletions

View File

@@ -0,0 +1,62 @@
#!/usr/bin/env bash
# Offload one Besu validator LXC from r630-01 to r630-04 to reduce *real* CPU contention on the
# source host (same JVM work, fewer co-scheduled Besu processes per NUMA/socket).
#
# Default VMID 1001 (historically problematic when co-packed with 1000/1002); override with --vmid.
# Does not change Besu *allocation* inside the guest — it spreads physical load across nodes.
#
# Usage:
# bash scripts/operator/migrate-besu-validator-r630-01-to-r630-04.sh
# bash scripts/operator/migrate-besu-validator-r630-01-to-r630-04.sh --apply
# bash scripts/operator/migrate-besu-validator-r630-01-to-r630-04.sh --vmid 1002 --apply
#
# Requires: cluster membership, r630-04 online, storage (default local-lvm on target).
# Optional: PROXMOX_OPS_ALLOWED_VMIDS, PROXMOX_OPS_APPLY (see proxmox-production-guard.sh).
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
# shellcheck source=/dev/null
source "${PROJECT_ROOT}/config/ip-addresses.conf" 2>/dev/null || true
# shellcheck source=/dev/null
source "${PROJECT_ROOT}/scripts/lib/proxmox-production-guard.sh"
SRC="${PROXMOX_HOST_R630_01:-192.168.11.11}"
DST_NODE="${BESU_VALIDATOR_MIGRATE_TARGET_NODE:-r630-04}"
STORE="${BESU_VALIDATOR_MIGRATE_TARGET_STORAGE:-local-lvm}"
VMID="${BESU_VALIDATOR_MIGRATE_VMID:-1001}"
SSH_OPTS=(-o ConnectTimeout=20 -o BatchMode=yes -o StrictHostKeyChecking=no)
APPLY=false
while [[ $# -gt 0 ]]; do
  case "$1" in
    --apply) APPLY=true ;;
    --vmid)
      # Fail with a diagnostic (not a silent exit) when the value is missing.
      [[ $# -ge 2 ]] || { echo "--vmid requires a value" >&2; exit 2; }
      VMID="$2"
      shift 2
      continue
      ;;
    -h|--help) sed -n '1,28p' "$0"; exit 0 ;;
    *) echo "Unknown: $1" >&2; exit 2 ;;
  esac
  shift
done
# Validate before interpolating into the remote command line (consistent with
# restart-besu-validator-single.sh) so a typo can never reach `pct migrate`.
[[ "$VMID" =~ ^[0-9]+$ ]] || { echo "Bad vmid: $VMID" >&2; exit 2; }
# PVE 9+: --target-storage (see migrate-ml110-besu-rpc-to-r630-02-03.sh). Older clusters may use --storage.
CMD="pct migrate ${VMID} ${DST_NODE} --target-storage ${STORE} --restart 1"
if ! pguard_require_apply_flag "$APPLY"; then
  echo "[dry-run] From source node (run as root on node that currently owns the CT):"
  echo " ssh root@${SRC} \"$CMD\""
  echo ""
  echo "After migrate, update get_host_for_vmid in scripts/lib/load-project-env.sh for VMID ${VMID}."
  exit 0
fi
pguard_vmid_allowed "$VMID" || exit 1
echo "[apply] ssh root@${SRC} \"$CMD\""
ssh "${SSH_OPTS[@]}" "root@${SRC}" "$CMD"
echo "[apply] Done. Update scripts/lib/load-project-env.sh get_host_for_vmid for ${VMID} -> ${DST_NODE} IP (${PROXMOX_HOST_R630_04:-192.168.11.14})."

View File

@@ -0,0 +1,65 @@
#!/usr/bin/env bash
# Surgical: restart exactly one Besu validator LXC service (default VMID 1001 — stuck participant).
# Resolves PVE host via get_host_for_vmid (scripts/lib/load-project-env.sh). No other CTs touched.
#
# Usage:
# bash scripts/operator/restart-besu-validator-single.sh --dry-run
# PROXMOX_OPS_APPLY=1 PROXMOX_OPS_ALLOWED_VMIDS=1001 bash scripts/operator/restart-besu-validator-single.sh --vmid 1001 --apply
#
# Requires: LAN SSH to Proxmox. Mutations require --apply or PROXMOX_OPS_APPLY=1 and (if set) allowlist.
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
# shellcheck source=/dev/null
source "${PROJECT_ROOT}/scripts/lib/load-project-env.sh"
# shellcheck source=/dev/null
source "${PROJECT_ROOT}/scripts/lib/proxmox-production-guard.sh"
VMID="${BESU_SURGICAL_RESTART_VMID:-1001}"
APPLY=false
DRY=false
SSH_OPTS=(-o ConnectTimeout=15 -o BatchMode=yes -o StrictHostKeyChecking=no)
# Print the header comment block of this script as usage text.
usage() {
  sed -n '1,18p' "$0"
}
while [[ $# -gt 0 ]]; do
  case "$1" in
    --vmid)
      # Guard the lookahead: under `set -u` a bare `--vmid` would otherwise
      # die with an opaque "unbound variable" error instead of usage + exit 2.
      [[ $# -ge 2 ]] || { echo "--vmid requires a value" >&2; usage >&2; exit 2; }
      VMID="$2"; shift 2 ;;
    --apply) APPLY=true; shift ;;
    --dry-run) DRY=true; shift ;;
    -h|--help) usage; exit 0 ;;
    *) echo "Unknown: $1" >&2; usage >&2; exit 2 ;;
  esac
done
[[ "$VMID" =~ ^[0-9]+$ ]] || { echo "Bad vmid: $VMID" >&2; exit 2; }
host="$(get_host_for_vmid "$VMID")"
unit="besu-validator.service"
if $DRY || ! pguard_require_apply_flag "$APPLY"; then
  echo "[dry-run] ssh root@${host} pct exec ${VMID} -- systemctl restart ${unit}"
  echo "[dry-run] Then: cast block-number --rpc-url \${RPC_URL_138:-http://192.168.11.211:8545} (repeat)"
  exit 0
fi
pguard_vmid_allowed "$VMID" || exit 1
echo "[apply] VMID ${VMID} on ${host}: systemctl restart ${unit}"
if ssh "${SSH_OPTS[@]}" "root@${host}" "pct exec ${VMID} -- systemctl restart ${unit}"; then
  echo "[apply] restart command returned 0"
else
  rc=$?
  echo "[apply] restart failed (exit $rc)" >&2
  exit 1
fi
sleep 5
# `systemctl is-active` prints a single state word; compare it exactly.
# BUGFIX: the previous substring `grep -q active` also matched "inactive",
# reporting a dead service as healthy. `|| true` keeps set -e from aborting
# on a non-zero is-active exit (non-active states return non-zero).
state="$(ssh "${SSH_OPTS[@]}" "root@${host}" "pct exec ${VMID} -- systemctl is-active ${unit}" 2>/dev/null || true)"
if [[ "$state" == "active" ]]; then
  echo "[apply] ${unit} is active"
else
  echo "[apply] WARN: service may not be active yet; check journal on CT ${VMID}" >&2
fi

View File

@@ -0,0 +1,68 @@
#!/usr/bin/env bash
# Revert Proxmox cgroup CPU caps (cores/cpulimit) applied on r630-01 for Besu validators,
# core RPC, and sentries. Those caps throttle *allocation*; they do not reduce Besu's real
# work — use migration / JVM tuning / fewer co-located JVMs to lower *measured* host load.
#
# Restores:
# 1000–1002: cores 4, cpulimit removed (was 2/1)
# 2101: cores 4, cpulimit removed (was 2/2)
# 1500–1502: cores 2, cpulimit removed (was 2/1)
#
# Usage:
# bash scripts/operator/revert-besu-cgroup-caps-r630-01.sh
# bash scripts/operator/revert-besu-cgroup-caps-r630-01.sh --apply
#
# Requires: PROXMOX_OPS_APPLY=1 or --apply (see scripts/lib/proxmox-production-guard.sh).
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
# shellcheck source=/dev/null
source "${PROJECT_ROOT}/config/ip-addresses.conf" 2>/dev/null || true
# shellcheck source=/dev/null
source "${PROJECT_ROOT}/scripts/lib/proxmox-production-guard.sh"
HOST="${PROXMOX_HOST_R630_01:-192.168.11.11}"
SSH_OPTS=(-o ConnectTimeout=12 -o BatchMode=yes -o StrictHostKeyChecking=no)
APPLY=false
while [[ $# -gt 0 ]]; do
  case "$1" in
    --apply) APPLY=true ;;
    -h|--help) sed -n '1,25p' "$0"; exit 0 ;;
    *) echo "Unknown: $1" >&2; exit 2 ;;
  esac
  shift
done
# Dry-run line for one validator CT: restore 4 cores, drop any cpulimit cap.
revert_validator() {
  local vmid="$1"
  echo " pct set $vmid --cores 4 --delete cpulimit"
}
# Dry-run line for one sentry CT: restore 2 cores, drop any cpulimit cap.
revert_sentry() {
  local vmid="$1"
  echo " pct set $vmid --cores 2 --delete cpulimit"
}
if ! pguard_require_apply_flag "$APPLY"; then
  echo "[dry-run] On root@${HOST}:"
  for v in 1000 1001 1002; do revert_validator "$v"; done
  echo " pct set 2101 --cores 4 --delete cpulimit"
  for v in 1500 1501 1502; do revert_sentry "$v"; done
  exit 0
fi
for v in 1000 1001 1002; do
  pguard_vmid_allowed "$v" || continue
  ssh "${SSH_OPTS[@]}" "root@${HOST}" "pct set $v --cores 4 --delete cpulimit"
done
# Plain `if` instead of `allowed && ssh || true`: the short-circuit form also
# swallowed genuine ssh/`pct set` failures, while the loops above abort under
# `set -e`. Now a disallowed VMID is skipped but a real failure stops the run.
if pguard_vmid_allowed "2101"; then
  ssh "${SSH_OPTS[@]}" "root@${HOST}" "pct set 2101 --cores 4 --delete cpulimit"
fi
for v in 1500 1501 1502; do
  pguard_vmid_allowed "$v" || continue
  ssh "${SSH_OPTS[@]}" "root@${HOST}" "pct set $v --cores 2 --delete cpulimit"
done
echo "--- post (pct + host cgroup sample) ---"
ssh "${SSH_OPTS[@]}" "root@${HOST}" "for v in 1000 2101 1500; do echo \"=== \$v ===\"; pct config \$v | grep -E '^(cores|cpulimit):' || true; echo -n \" cgroup cpu.max: \"; cat /sys/fs/cgroup/lxc/\$v/cpu.max; echo; done; uptime"