#!/usr/bin/env bash
# Idempotent remediation for RPC node stability on Proxmox.
#
# What it fixes (optionally):
#   1) Storage node restriction mismatch:
#      - Ensures the storage backing RPC VMID rootfs (e.g., local-lvm) is allowed on the node
#        where the VMID is running (prevents "storage 'local-lvm' is not available on node ..." failures).
#   2) Besu heap oversizing:
#      - Ensures BESU_OPTS (-Xms/-Xmx) in /etc/systemd/system/besu-rpc.service is sized to container memory.
#
# Safety:
#   - Default is DRY-RUN (no changes).
#   - Use --apply to perform changes.
#   - Service restarts are opt-in via --restart-besu.
#
# Usage:
#   PROXMOX_HOST=${PROXMOX_HOST_ML110:-192.168.11.10} ./scripts/remediate-proxmox-rpc-stability.sh
#   PROXMOX_HOST=${PROXMOX_HOST_ML110:-192.168.11.10} ./scripts/remediate-proxmox-rpc-stability.sh --apply --restart-besu
#
# Options:
#   --apply          Apply changes (otherwise dry-run)
#   --restart-besu   Restart besu-rpc inside affected VMIDs (only with --apply)
#   --only-storage   Only apply storage.cfg remediation
#   --only-heap      Only apply heap remediation
#   --vmids "..."    Override VMID list (space-separated)

set -euo pipefail

# Load IP configuration (best effort: a missing or failing config file is ignored).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
# shellcheck disable=SC1091
source "${PROJECT_ROOT}/config/ip-addresses.conf" 2>/dev/null || true

PROXMOX_SSH_USER="${PROXMOX_SSH_USER:-${PROXMOX_USER:-root}}"
# A value containing "@" is not used as an SSH login; fall back to root.
if [[ "$PROXMOX_SSH_USER" == *"@"* ]]; then
  PROXMOX_SSH_USER="root"
fi
PROXMOX_HOST="${PROXMOX_HOST:-192.168.11.10}"

APPLY=0
RESTART_BESU=0
ONLY_STORAGE=0
ONLY_HEAP=0
VMIDS_DEFAULT=(2400 2401 2402 2500 2501 2502 2503 2504 2505 2506 2507 2508)
VMIDS=("${VMIDS_DEFAULT[@]}")

#######################################
# Print the header comment of this script (everything between the shebang
# and the first non-comment line) with the leading "# " stripped.
# Outputs: help text on stdout
#######################################
usage() {
  awk 'NR == 1 { next } /^#/ { sub(/^# ?/, ""); print; next } { exit }' "$0"
}

# Write a timestamped message to stdout.
log() { printf '%s\n' "[$(date -Is)] $*"; }

# Write an error message to stderr and terminate with status 1.
die() {
  printf '%s\n' "ERROR: $*" >&2
  exit 1
}

while [[ $# -gt 0 ]]; do
  case "$1" in
    --apply)
      APPLY=1
      shift
      ;;
    --restart-besu)
      RESTART_BESU=1
      shift
      ;;
    --only-storage)
      ONLY_STORAGE=1
      shift
      ;;
    --only-heap)
      ONLY_HEAP=1
      shift
      ;;
    --vmids)
      shift
      [[ $# -gt 0 ]] || die "--vmids requires a value"
      # Split on whitespace without pathname expansion. read returns non-zero
      # at end of input when -d '' is used, hence the "|| true".
      read -r -d '' -a VMIDS <<<"$1" || true
      shift
      ;;
    -h | --help)
      usage
      exit 0
      ;;
    *)
      die "Unknown arg: $1 (use --help)"
      ;;
  esac
done

if [[ $ONLY_STORAGE -eq 1 && $ONLY_HEAP -eq 1 ]]; then
  die "Choose at most one of --only-storage / --only-heap"
fi
if [[ $RESTART_BESU -eq 1 && $APPLY -ne 1 ]]; then
  die "--restart-besu requires --apply"
fi

# VMIDs are interpolated into remote shell commands below, so accept only
# plain decimal numbers.
if (( ${#VMIDS[@]} == 0 )); then
  die "--vmids requires at least one VMID"
fi
for vmid in "${VMIDS[@]}"; do
  [[ "$vmid" =~ ^[0-9]+$ ]] || die "Invalid VMID: '${vmid}' (must be numeric)"
done

#######################################
# Run a command on the Proxmox host over SSH.
# Globals:   PROXMOX_SSH_USER, PROXMOX_HOST (read)
# Arguments: the remote command line (passed through to ssh)
# Returns:   the exit status of ssh / the remote command
# NOTE(review): StrictHostKeyChecking=no disables host key verification;
# confirm this is acceptable for the network this runs on.
#######################################
ssh_pve() {
  ssh -o StrictHostKeyChecking=no -o BatchMode=yes -o ConnectTimeout=6 \
    "${PROXMOX_SSH_USER}@${PROXMOX_HOST}" "$@"
}

remote_node="$(ssh_pve "hostname" 2>/dev/null || true)"
[[ -n "${remote_node}" ]] || die "Unable to SSH to ${PROXMOX_SSH_USER}@${PROXMOX_HOST}"

if [[ $APPLY -eq 1 ]]; then
  mode_label="APPLY"
else
  mode_label="DRY-RUN"
fi
log "Proxmox host: ${PROXMOX_HOST} (node name: ${remote_node})"
log "Mode: ${mode_label}"
log "VMIDs: ${VMIDS[*]}"
echo

#######################################
# Map container memory to recommended JVM heap settings.
# Arguments: $1 - container memory in MB
# Outputs:   "<Xms> <Xmx>" on stdout (strings suitable for BESU_OPTS);
#            "1g 2g" when $1 is not a non-negative integer
#######################################
recommend_heap() {
  local mem_mb="$1"
  if [[ ! "$mem_mb" =~ ^[0-9]+$ ]]; then
    echo "1g 2g"
    return 0
  fi
  # Force base 10 so a value with leading zeros is not parsed as octal.
  mem_mb=$((10#$mem_mb))
  if (( mem_mb >= 16384 )); then
    echo "8g 8g"
  elif (( mem_mb >= 6144 )); then
    # Covers both the 6 GB and 8 GB tiers, which share the same sizing.
    echo "2g 4g"
  elif (( mem_mb >= 4096 )); then
    echo "1g 2g"
  else
    echo "512m 1g"
  fi
}

#######################################
# Read one field from `pct config <vmid>` on the Proxmox host.
# Usage: get_vmid_field <vmid> <field>   (e.g. memory, swap, rootfs, hostname)
# Outputs: the value of the first matching "<field>: " line, or nothing
#######################################
get_vmid_field() {
  local vmid="$1"
  local field="$2"
  ssh_pve "pct config ${vmid} 2>/dev/null | sed -n 's/^${field}: //p' | head -1" 2>/dev/null | tr -d '\r'
}

#######################################
# Report the status of a container as printed by `pct status <vmid>`.
# Arguments: $1 - VMID
# Outputs:   the text after "status: ", or nothing
#######################################
vmid_status() {
  local vmid="$1"
  ssh_pve "pct status ${vmid} 2>/dev/null | sed -n 's/^status: //p'" 2>/dev/null | tr -d '\r'
}

########################################
# 1) Storage remediation (storage.cfg)
########################################
storage_changes=0
if [[ $ONLY_HEAP -ne 1 ]]; then
  log "Storage remediation: scanning VMID rootfs storages vs storage.cfg node allowlist"

  storages_needed=()
  for vmid in "${VMIDS[@]}"; do
    st="$(vmid_status "$vmid" || true)"
    rootfs="$(get_vmid_field "$vmid" "rootfs" || true)"
    [[ -n "${rootfs}" ]] || continue
    # rootfs looks like "<storage>:<volume>,..."; keep the part before the first ":".
    storage="${rootfs%%:*}"
    if [[ -n "${storage}" ]]; then
      storages_needed+=("${storage}")
    fi
    log " VMID ${vmid}: status=${st:-?} rootfs=${rootfs}"
  done

  # Unique storages. Guard the empty case: printf with no arguments would
  # still emit one empty line and produce a bogus empty storage name.
  unique_storages=()
  if (( ${#storages_needed[@]} > 0 )); then
    mapfile -t unique_storages < <(printf '%s\n' "${storages_needed[@]}" | sort -u)
  fi

  if [[ ${#unique_storages[@]} -eq 0 ]]; then
    log " No storages detected from VMID rootfs; skipping storage remediation."
  else
    log " Storages referenced by VMID rootfs: ${unique_storages[*]}"
  fi

  for storage in "${unique_storages[@]}"; do
    # Only handle storages defined in storage.cfg and restricted by nodes=.
    # If nodes= isn't present, it's cluster-wide.
    # ${storage@Q} is expanded locally into a quoted literal for the Python code.
    if ! allowed_nodes="$(ssh_pve "python3 - <<'PY'
from pathlib import Path

cfg = Path('/etc/pve/storage.cfg').read_text(encoding='utf-8')
storage = ${storage@Q}
in_section = False
nodes = None
for line in cfg.splitlines():
    # Any non-indented 'type: id' line opens a new section, whatever the type.
    if line.strip() and not line[0].isspace() and not line.startswith('#') and ':' in line:
        if in_section:
            break
        in_section = line.split(':', 1)[1].strip() == storage
        continue
    if in_section and line.strip().startswith('nodes '):
        nodes = line.strip().split(None, 1)[1]
        break
print(nodes or '')
PY" 2>/dev/null | tr -d '\r')"; then
      log " Storage '${storage}': unable to inspect storage.cfg on '${remote_node}' -> skipping"
      continue
    fi

    if [[ -z "${allowed_nodes}" ]]; then
      log " Storage '${storage}': no nodes restriction found (OK)"
      continue
    fi

    # Literal, comma-delimited membership test (no regex interpretation).
    allowed_csv=",${allowed_nodes// /},"
    if [[ "${allowed_csv}" == *",${remote_node},"* ]]; then
      log " Storage '${storage}': node '${remote_node}' already allowed (OK)"
      continue
    fi

    storage_changes=$((storage_changes + 1))
    log " Storage '${storage}': node '${remote_node}' NOT allowed (nodes=${allowed_nodes})"

    if [[ $APPLY -eq 1 ]]; then
      log " Applying: add '${remote_node}' to storage.cfg for ${storage}"
      # The heredoc is expanded locally: \$ stays a remote variable, \\n becomes
      # the Python escape \n, and the @Q expansions become quoted literals.
      # set -e makes the remote script stop before editing if the backup fails.
      ssh_pve "bash -s" <<EOS 2>/dev/null || log " WARNING: storage.cfg update for '${storage}' failed on '${remote_node}'"
set -e
CFG=/etc/pve/storage.cfg
TS=\$(date +%Y%m%d_%H%M%S)
cp -a "\$CFG" "/root/storage.cfg.bak.\$TS"
python3 - <<'PY'
from __future__ import annotations
from pathlib import Path

cfg = Path('/etc/pve/storage.cfg')
storage = ${storage@Q}
node = ${remote_node@Q}

lines = cfg.read_text(encoding='utf-8').splitlines(True)
out = []
in_section = False
updated = False
for line in lines:
    # Any non-indented 'type: id' line opens a new section, whatever the type.
    if line.strip() and not line[0].isspace() and not line.startswith('#') and ':' in line:
        in_section = line.split(':', 1)[1].strip() == storage
        out.append(line)
        continue
    if in_section and line.lstrip().startswith('nodes '):
        indent = line[: len(line) - len(line.lstrip())]
        fields = line.strip().split(None, 1)
        nodes_str = fields[1] if len(fields) > 1 else ''
        parts = [p.strip() for p in nodes_str.split(',') if p.strip()]
        if node not in parts:
            parts.append(node)
            updated = True
        out.append(f"{indent}nodes {','.join(parts)}\\n")
        continue
    out.append(line)

if updated:
    cfg.write_text(''.join(out), encoding='utf-8')
print('updated' if updated else 'no_change')
PY
EOS
    else
      log " DRY-RUN: would add '${remote_node}' to storage.cfg nodes= for storage '${storage}'"
    fi
  done
  echo
fi

########################################
# 2) Heap remediation (BESU_OPTS)
########################################
heap_changes=0
if [[ $ONLY_STORAGE -ne 1 ]]; then
  log "Besu heap remediation: scanning BESU_OPTS vs container memory"
  UNIT="/etc/systemd/system/besu-rpc.service"

  for vmid in "${VMIDS[@]}"; do
    st="$(vmid_status "$vmid" || true)"
    mem="$(get_vmid_field "$vmid" "memory" || true)"
    hostn="$(get_vmid_field "$vmid" "hostname" || true)"
    rec="$(recommend_heap "${mem:-0}")"
    xms="${rec%% *}"
    xmx="${rec##* }"

    if [[ "${st}" != "running" ]]; then
      log " VMID ${vmid} (${hostn:-?}): status=${st:-?} -> skipping heap check"
      continue
    fi

    current_line="$(ssh_pve "pct exec ${vmid} -- bash -lc \"grep -n 'BESU_OPTS' ${UNIT} 2>/dev/null | head -1\"" 2>/dev/null | tr -d '\r' || true)"
    if [[ -z "${current_line}" ]]; then
      log " VMID ${vmid} (${hostn:-?} mem=${mem}MB): BESU_OPTS line missing -> skipping"
      continue
    fi

    # An update is needed unless both recommended flags are already present.
    needs=0
    if [[ "${current_line}" != *"-Xms${xms}"* ]]; then
      needs=1
    fi
    if [[ "${current_line}" != *"-Xmx${xmx}"* ]]; then
      needs=1
    fi

    if [[ "${needs}" -eq 0 ]]; then
      log " VMID ${vmid} (${hostn:-?} mem=${mem}MB): OK (${current_line})"
      continue
    fi

    heap_changes=$((heap_changes + 1))
    log " VMID ${vmid} (${hostn:-?} mem=${mem}MB): needs heap update -> -Xms${xms} -Xmx${xmx}"
    log " current: ${current_line}"

    if [[ $APPLY -eq 1 ]]; then
      ts="$(date +%Y%m%d_%H%M%S)"
      log " Applying: update ${UNIT} (backup .bak.${ts})"
      # Replaces the whole Environment="BESU_OPTS=..." line; only the quoted
      # form at the start of a line is matched by the sed expression.
      ssh_pve "pct exec ${vmid} -- bash -lc \"set -e; cp -a ${UNIT} ${UNIT}.bak.${ts}; sed -i 's/^Environment=\\\"BESU_OPTS=.*/Environment=\\\"BESU_OPTS=-Xms${xms} -Xmx${xmx}\\\"/' ${UNIT}; grep -n 'BESU_OPTS' ${UNIT}\""
      if [[ $RESTART_BESU -eq 1 ]]; then
        log " Restarting besu-rpc"
        ssh_pve "pct exec ${vmid} -- bash -lc \"set -e; systemctl daemon-reload; systemctl restart besu-rpc\""
      else
        log " NOTE: besu-rpc not restarted (use --restart-besu)"
      fi
    else
      log " DRY-RUN: would set BESU_OPTS=-Xms${xms} -Xmx${xmx} and optionally restart"
    fi
  done
  echo
fi

log "Done."
log "Planned/applied changes summary:"
log " storage adjustments needed: ${storage_changes}"
log " heap adjustments needed: ${heap_changes}"
if [[ $APPLY -eq 0 ]]; then
  log "Run again with --apply (and optionally --restart-besu) to enforce changes."
fi