Files
proxmox/scripts/maintenance/run-all-maintenance-via-proxmox-ssh.sh
defiQUG bea1903ac9
Some checks failed
Deploy to Phoenix / deploy (push) Has been cancelled
Sync all local changes: docs, config, scripts, submodule refs, verification evidence
Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-21 15:46:06 -08:00

169 lines
6.4 KiB
Bash
Executable File

#!/usr/bin/env bash
# Run all maintenance/fix scripts that use SSH to Proxmox VE hosts (r630-01, ml110, r630-02).
# Run from project root. Requires: SSH key-based auth to root@<each host>, LAN access.
#
# Usage:
# ./scripts/maintenance/run-all-maintenance-via-proxmox-ssh.sh
# ./scripts/maintenance/run-all-maintenance-via-proxmox-ssh.sh --no-npm
# ./scripts/maintenance/run-all-maintenance-via-proxmox-ssh.sh --e2e
# ./scripts/maintenance/run-all-maintenance-via-proxmox-ssh.sh --verbose # show all step output (no 2>/dev/null)
# ./scripts/maintenance/run-all-maintenance-via-proxmox-ssh.sh --dry-run
#
# Step 2 (2101 fix) can be slow (apt in CT). Timeout: STEP2_TIMEOUT=900 (default) or 0 to disable.
#
# Scripts run (each SSHs to the Proxmox hosts):
# 0. make-rpc-vmids-writable-via-ssh.sh — Stop 2101,2500-2505; e2fsck rootfs; start (r630-01)
# 1. resolve-and-fix-all-via-proxmox-ssh.sh — Dev VM IP, start containers, DBIS (r630-01, ml110)
# 2. fix-rpc-2101-jna-reinstall.sh — 2101 Besu reinstall (r630-01)
# 3. install-besu-permanent-on-missing-nodes.sh — Besu on 2500-2505, 1505-1508 (r630-01, ml110)
# 4. address-all-remaining-502s.sh — backends + NPM proxy + RPC diagnostics
# 5. [optional] verify-end-to-end-routing.sh — E2E (if --e2e)
#
# See: docs/00-meta/502_DEEP_DIVE_ROOT_CAUSES_AND_FIXES.md, docs/05-network/CHECK_ALL_UPDATES_AND_CLOUDFLARE_TUNNELS.md
# Strict mode: abort on unhandled errors, unset variables and failed pipeline stages.
set -euo pipefail

# Resolve this script's directory and the project root two levels above it,
# then work from the project root.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
cd "$PROJECT_ROOT"

# Optional environment files: sourced only when present; their stderr is
# discarded and a failure while loading them is ignored on purpose.
if [[ -f "${PROJECT_ROOT}/scripts/lib/load-project-env.sh" ]]; then
  source "${PROJECT_ROOT}/scripts/lib/load-project-env.sh" 2>/dev/null || true
fi
if [[ -f "${PROJECT_ROOT}/config/ip-addresses.conf" ]]; then
  source "${PROJECT_ROOT}/config/ip-addresses.conf" 2>/dev/null || true
fi

# Proxmox host addresses: a PROXMOX_HOST_* value wins when set and non-empty,
# otherwise the literal fallback address is used.
ML110="${PROXMOX_HOST_ML110:-192.168.11.10}"
R630_01="${PROXMOX_HOST_R630_01:-192.168.11.11}"
R630_02="${PROXMOX_HOST_R630_02:-192.168.11.12}"
# Option flags; each holds the literal word "true" or "false" so it can be
# executed directly as a command (e.g. `if $VERBOSE; then`).
SKIP_NPM=false
RUN_E2E=false
DRY_RUN=false
VERBOSE=false

#######################################
# Translate command-line flags into the option globals.
# Globals:   SKIP_NPM, RUN_E2E, DRY_RUN, VERBOSE (written)
# Arguments: the script's command-line arguments
# Outputs:   a warning on stderr for every unrecognized argument
# Returns:   0 always; unrecognized arguments are skipped, not fatal
#######################################
parse_maintenance_args() {
  local arg
  for arg; do
    case "$arg" in
      --no-npm) SKIP_NPM=true ;;
      --e2e) RUN_E2E=true ;;
      --dry-run) DRY_RUN=true ;;
      --verbose) VERBOSE=true ;;
      '') ;; # an empty argument is silently accepted
      *)
        # A misspelled flag (e.g. --dryrun) must not go unnoticed: without
        # --dry-run the maintenance steps really run.
        printf ' Warning: ignoring unrecognized argument: %s\n' "$arg" >&2
        ;;
    esac
  done
  return 0
}
# ${1+"$@"} expands to nothing when there are no arguments, which also stays
# safe under `set -u` on old bash releases.
parse_maintenance_args ${1+"$@"}

# Step 2 (2101 fix) timeout in seconds; 0 = no timeout
STEP2_TIMEOUT="${STEP2_TIMEOUT:-900}"
if [[ "$STEP2_TIMEOUT" =~ ^[0-9]+$ ]]; then
  # Force base 10 so a value such as 0900 is not parsed as invalid octal later.
  STEP2_TIMEOUT=$((10#$STEP2_TIMEOUT))
else
  # The value is later compared numerically; reject anything else up front
  # instead of failing in the middle of the run.
  printf ' Warning: STEP2_TIMEOUT=%s is not a whole number of seconds; using 900.\n' "$STEP2_TIMEOUT" >&2
  STEP2_TIMEOUT=900
fi
#######################################
# Run a step script with bash.
# Globals:   VERBOSE (read) - "true" keeps the step's stderr visible,
#            "false" discards it
# Arguments: script path followed by any arguments for it
# Returns:   the exit status of the bash invocation
#######################################
run_step() {
  if ! $VERBOSE; then
    bash "$@" 2>/dev/null
    return
  fi
  bash "$@"
}
# Banner: target hosts and the effective options for this run.
printf '%s\n' \
  "" \
  "=== Run all maintenance via Proxmox SSH ===" \
  " Hosts: r630-01=$R630_01 ml110=$ML110 r630-02=$R630_02" \
  " --no-npm=$SKIP_NPM --e2e=$RUN_E2E --verbose=$VERBOSE --dry-run=$DRY_RUN STEP2_TIMEOUT=$STEP2_TIMEOUT" \
  ""

# Dry run: list the steps that would execute, then stop without running anything.
if $DRY_RUN; then
  dry_npm_flag=""
  if $SKIP_NPM; then
    dry_npm_flag='--no-npm'
  fi
  printf '%s\n' \
    "Would run in order:" \
    " 0. make-rpc-vmids-writable-via-ssh.sh" \
    " 1. resolve-and-fix-all-via-proxmox-ssh.sh" \
    " 2. fix-rpc-2101-jna-reinstall.sh (timeout=${STEP2_TIMEOUT}s if set)" \
    " 3. install-besu-permanent-on-missing-nodes.sh" \
    " 4. address-all-remaining-502s.sh ${dry_npm_flag}"
  # Step 5 is listed only when --e2e was given.
  if $RUN_E2E; then
    echo " 5. verify-end-to-end-routing.sh"
  fi
  printf '%s\n' \
    " Use --verbose to show all step output; STEP2_TIMEOUT=0 to disable step 2 timeout." \
    ""
  exit 0
fi
# Quick SSH check to all hosts we need
# Each host is probed once with a 5-second connect timeout; a failed probe
# only prints a warning and the run continues.
ssh_probe_opts=(-o ConnectTimeout=5 -o StrictHostKeyChecking=accept-new)
for host in "$R630_01" "$ML110" "$R630_02"; do
  if ! ssh "${ssh_probe_opts[@]}" "root@$host" "echo OK" 2>/dev/null; then
    echo " Warning: cannot SSH to $host (some steps may fail)"
    continue
  fi
  echo " SSH $host OK"
done
echo ""
# 0. Make RPC VMIDs writable (e2fsck so fix/install scripts can write)
# A non-zero exit from the step is reported as a warning; the run continues.
echo "[0/5] Making RPC VMIDs writable..."
echo "--- 0/5: Make RPC VMIDs writable (r630-01: 2101, 2500-2505) ---"
step0_outcome=" Step had warnings (check output)."
if run_step "${SCRIPT_DIR}/make-rpc-vmids-writable-via-ssh.sh"; then
  step0_outcome=" Done."
fi
echo "$step0_outcome"
echo ""
# 1. Resolve and fix (Dev VM IP, start containers, DBIS)
# A non-zero exit from the step is reported as a warning; the run continues.
printf '%s\n' \
  "[1/5] Resolve and fix (Dev VM, containers, DBIS)..." \
  "--- 1/5: Resolve and fix via Proxmox SSH (r630-01, ml110) ---"
if ! run_step "${SCRIPT_DIR}/resolve-and-fix-all-via-proxmox-ssh.sh"; then
  echo " Step had warnings (check output)."
else
  echo " Done."
fi
echo ""
# 2. Fix 2101 JNA reinstall (r630-01) — can be slow (apt in CT); optional timeout

#######################################
# Work out the timeout (in whole seconds) to apply to step 2.
# Arguments: $1 - requested timeout; may be empty
# Outputs:   the limit in base-10 seconds on stdout, or 0 when no timeout
#            applies (value is 0, empty or not a whole number, or the
#            timeout(1) command is not installed)
# Returns:   0 always
#######################################
step2_timeout_seconds() {
  local raw="${1:-}"
  if [[ "$raw" =~ ^[0-9]+$ ]] && command -v timeout >/dev/null 2>&1; then
    # 10# forces base 10 so values such as 0900 are not read as invalid octal.
    printf '%s\n' "$((10#$raw))"
  else
    printf '0\n'
  fi
}

echo "[2/5] Fix 2101 Besu JNA reinstall (may take several minutes)..."
echo "--- 2/5: Fix 2101 Besu JNA reinstall (r630-01) ---"
step2_ok=false
step2_ret=0
if [[ -n "${STEP2_TIMEOUT:-}" && ! "${STEP2_TIMEOUT:-}" =~ ^[0-9]+$ ]]; then
  echo " Warning: STEP2_TIMEOUT='${STEP2_TIMEOUT}' is not a whole number of seconds; running step 2 without a timeout." >&2
fi
step2_limit="$(step2_timeout_seconds "${STEP2_TIMEOUT:-}")"

# Build the command once; wrap it in timeout(1) only when a limit applies.
step2_cmd=(bash "${SCRIPT_DIR}/fix-rpc-2101-jna-reinstall.sh")
if ((step2_limit > 0)); then
  step2_cmd=(timeout "$step2_limit" "${step2_cmd[@]}")
fi

# `|| step2_ret=$?` records the exit status without tripping `set -e`.
if $VERBOSE; then
  "${step2_cmd[@]}" || step2_ret=$?
else
  "${step2_cmd[@]}" 2>/dev/null || step2_ret=$?
fi

if ((step2_ret == 0)); then
  step2_ok=true
  echo " Done."
elif ((step2_ret == 124 && step2_limit > 0)); then
  # 124 is the status timeout(1) returns when it had to stop the command.
  echo " Step 2 timed out after ${step2_limit}s. Re-run manually: ./scripts/maintenance/fix-rpc-2101-jna-reinstall.sh"
else
  # Any other failure used to pass silently; report it like the other steps do.
  echo " Step had warnings (exit ${step2_ret}; check output or re-run with --verbose)."
fi
echo ""
# 3. Install Besu on missing nodes (r630-01, ml110)
# A non-zero exit from the step is reported; the run continues.
echo "[3/5] Install Besu on missing nodes..."
echo "--- 3/5: Install Besu on missing nodes (r630-01, ml110) ---"
step3_outcome=" Step had failures (e.g. disk full or read-only CT)."
if run_step "${PROJECT_ROOT}/scripts/besu/install-besu-permanent-on-missing-nodes.sh"; then
  step3_outcome=" Done."
fi
printf '%s\n' "$step3_outcome" ""
# 4. Address all remaining 502s (backends + NPM + diagnostics)
# A non-zero exit from the step is reported as a warning; the run continues.
echo "[4/5] Address all remaining 502s (backends + NPM + diagnostics)..."
echo "--- 4/5: Address all remaining 502s (SSH to r630-01, r630-02) ---"
if $SKIP_NPM; then
  ADDR_ARGS="--no-npm"
else
  ADDR_ARGS=""
fi
# ${ADDR_ARGS:+...} passes the flag as one word, or no argument at all when it is empty.
if ! run_step "${SCRIPT_DIR}/address-all-remaining-502s.sh" ${ADDR_ARGS:+"$ADDR_ARGS"}; then
  echo " Step had warnings (check output)."
else
  echo " Done."
fi
echo ""
# 5. Optional E2E
# Runs only with --e2e and only if the verify script exists. Its exit status
# is ignored (|| true), so an E2E failure does not fail this script.
if $RUN_E2E; then
  printf '%s\n' \
    "[5/5] E2E verification..." \
    "--- 5/5: E2E verification ---"
  e2e_script="${PROJECT_ROOT}/scripts/verify/verify-end-to-end-routing.sh"
  if [[ -f "$e2e_script" ]]; then
    # E2E_ACCEPT_502_INTERNAL=1 is set only in the environment of this one invocation.
    if ! $VERBOSE; then
      E2E_ACCEPT_502_INTERNAL=1 bash "$e2e_script" 2>/dev/null || true
    else
      E2E_ACCEPT_502_INTERNAL=1 bash "$e2e_script" || true
    fi
  fi
  echo ""
fi

# Closing summary; printed whatever the outcome of the individual steps.
printf '%s\n' \
  "=== All maintenance steps (via Proxmox SSH) completed ===" \
  " Next: ./scripts/verify/verify-end-to-end-routing.sh" \
  " Reports: docs/04-configuration/verification-evidence/" \
  ""