Files
proxmox/scripts/clear-all-transaction-pools.sh
defiQUG dbd517b279 Sync workspace: config, docs, scripts, CI, operator rules, and submodule pointers.
- Update dbis_core, cross-chain-pmm-lps, explorer-monorepo, metamask-integration, pr-workspace/chains
- Omit embedded publish git dirs and empty placeholders from index

Made-with: Cursor
2026-04-12 06:12:20 -07:00

309 lines
13 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env bash
# Clear transaction pools on all Besu nodes (validators, Core/Thirdweb/public RPC).
#
# Stuck txs often reappear on Core when sentries or Alltra/Hybx RPC CTs (still on the P2P mesh) were not cleared.
# To include those: CLEAR_BESU_PEER_TXPOOLS=1 bash scripts/clear-all-transaction-pools.sh
# Peer tier (1500-1502, 2420/2430/2440/2460/2470/2480) uses pct stop + pct mount on the PVE host (avoids hung systemctl inside flaky CTs).
#
# SSH: uses PROXMOX_SSH_USER from config/ip-addresses.conf (root). If .env sets PROXMOX_USER=root@pam for the API,
# that value is NOT used for SSH (see PROXMOX_USER= assignment below).
# Afterward: Core RPC (2101) can take several minutes to bind JSON-RPC while RocksDB opens/compacts; public RPC may recover first.
set -euo pipefail

# Load IP configuration (best-effort: the hard-coded defaults below cover a missing conf).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
# shellcheck source=/dev/null
source "${PROJECT_ROOT}/config/ip-addresses.conf" 2>/dev/null || true

# Peer-tier Besu CTs (sentries, Alltra/Hybx RPC) retain mempool and can re-gossip txs to Core after a clear.
# Default: skip them (faster). To flush those pools too: CLEAR_BESU_PEER_TXPOOLS=1 bash scripts/clear-all-transaction-pools.sh
CLEAR_BESU_PEER_TXPOOLS="${CLEAR_BESU_PEER_TXPOOLS:-0}"

# Shell SSH must be root@host — not root@pam@host (.env often sets PROXMOX_USER=root@pam for API).
PROXMOX_SSH_USER="${PROXMOX_SSH_USER:-root}"
# Any '@' marks an API realm identity, which ssh would misparse as user@host — force plain root.
[[ "$PROXMOX_SSH_USER" == *"@"* ]] && PROXMOX_SSH_USER="root"
PROXMOX_USER="$PROXMOX_SSH_USER"

# PVE host addresses — first non-empty of: explicit override, conf alias, hard default.
PROXMOX_ML110="${PROXMOX_ML110:-${PROXMOX_HOST_ML110:-192.168.11.10}}"
PROXMOX_R630="${PROXMOX_R630:-${PROXMOX_R630_01:-${PROXMOX_HOST_R630_01:-192.168.11.11}}}"
R630_03="${PROXMOX_R630_03:-${PROXMOX_HOST_R630_03:-192.168.11.13}}"
R630_02="${PROXMOX_R630_02:-${PROXMOX_HOST_R630_02:-192.168.11.12}}"

# ANSI color codes for the log helpers below.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m'

# Leveled log helpers; $1 is the message (echo -e expands the stored color escapes).
log_info() { echo -e "${BLUE}[INFO]${NC} $1"; }
log_success() { echo -e "${GREEN}[✓]${NC} $1"; }
log_warn() { echo -e "${YELLOW}[⚠]${NC} $1"; }
log_error() { echo -e "${RED}[✗]${NC} $1"; }
log_section() { echo -e "\n${CYAN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"; echo -e "${CYAN}$1${NC}"; echo -e "${CYAN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}\n"; }

echo "=== Clear Transaction Pools on All Nodes ==="
echo ""
# Stop, wipe, and restart the Besu service inside one LXC container via pct exec.
# Arguments:
#   $1 - CT VMID
#   $2 - PVE host running the CT
#   $3 - human-readable node type label (log output only)
# Globals: PROXMOX_USER (read) — SSH login user.
# Notes: all four known unit names are attempted; the missing ones fail silently.
#        Remote "Configuration file" stderr chatter is filtered out of every step.
clear_node_pool() {
  local VMID=$1
  local HOST=$2
  local NODE_TYPE=$3
  local SSH_TARGET="${PROXMOX_USER}@${HOST}"
  log_info "Clearing transaction pool for $NODE_TYPE (VMID $VMID on $HOST)..."
  # Stop the service (timeout each stop — Besu can hang on SIGTERM during heavy I/O)
  log_info " Stopping service..."
  ssh -o ConnectTimeout=10 -o StrictHostKeyChecking=no "$SSH_TARGET" \
    "pct exec $VMID -- bash -c '
      stop_besu() {
        local n=\"\$1\"
        timeout 90 systemctl stop \"\${n}.service\" 2>/dev/null || timeout 90 systemctl stop \"\$n\" 2>/dev/null || true
      }
      stop_besu besu-validator
      stop_besu besu-rpc-core
      stop_besu besu-rpc
      stop_besu besu-sentry
    '" 2>&1 | grep -v "Configuration file" || true
  sleep 2
  # Find and clear transaction pool database
  log_info " Clearing transaction pool database..."
  # FIX: was '|| echo "Cleared"', which guaranteed CLEAR_RESULT was non-empty and made
  # the "may not exist" warn branch below unreachable. '|| true' keeps this safe under
  # set -e/pipefail while letting an empty result (no data dir found, or ssh failure)
  # actually reach the warn path.
  CLEAR_RESULT=$(ssh -o ConnectTimeout=10 -o StrictHostKeyChecking=no "$SSH_TARGET" \
    "pct exec $VMID -- bash -c '
      DATA_DIRS=\"/data/besu /var/lib/besu\"
      for DATA_DIR in \$DATA_DIRS; do
        if [ -d \"\$DATA_DIR\" ]; then
          # Find transaction pool database files
          find \"\$DATA_DIR\" -type d -name \"*pool*\" -exec rm -rf {} \; 2>/dev/null || true
          find \"\$DATA_DIR\" -type f -name \"*transaction*\" -delete 2>/dev/null || true
          find \"\$DATA_DIR\" -type f -name \"*txpool*\" -delete 2>/dev/null || true
          echo \"Cleared: \$DATA_DIR\"
        fi
      done
    '" 2>&1 | grep -v "Configuration file" || true)
  if [ -n "$CLEAR_RESULT" ]; then
    log_success " Transaction pool cleared"
  else
    log_warn " Could not clear transaction pool (may not exist)"
  fi
  # Restart the service (first successful start wins; timeout avoids indefinite hang)
  log_info " Restarting service..."
  ssh -o ConnectTimeout=10 -o StrictHostKeyChecking=no "$SSH_TARGET" \
    "pct exec $VMID -- bash -c '
      timeout 120 systemctl start besu-validator.service 2>/dev/null || timeout 120 systemctl start besu-validator 2>/dev/null || \
      timeout 120 systemctl start besu-rpc-core.service 2>/dev/null || timeout 120 systemctl start besu-rpc-core 2>/dev/null || \
      timeout 120 systemctl start besu-rpc.service 2>/dev/null || timeout 120 systemctl start besu-rpc 2>/dev/null || \
      timeout 120 systemctl start besu-sentry.service 2>/dev/null || timeout 120 systemctl start besu-sentry 2>/dev/null || true
    '" 2>&1 | grep -v "Configuration file" || true
  sleep 3
  # Verify at least one Besu unit is active (single line — avoids inactive\ninactive\nactive noise)
  STATUS=$(ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "$SSH_TARGET" \
    "pct exec $VMID -- bash -c '
      for u in besu-validator besu-rpc-core besu-rpc besu-sentry; do
        for s in \"\$u\" \"\${u}.service\"; do
          st=\$(systemctl is-active \"\$s\" 2>/dev/null || true)
          [ \"\$st\" = active ] && { echo active; exit 0; }
        done
      done
      echo inactive
    '" 2>&1 | grep -v "Configuration file" | tr -d '\r' | tail -1) \
    || STATUS="unknown"
  if [ "$STATUS" = "active" ]; then
    log_success " Service restarted and active"
  else
    log_warn " Service status: $STATUS"
  fi
  echo ""
}
# Clear txpool files while a peer-tier CT is stopped — mount its rootfs from the PVE
# host (reliable for sentries / Alltra RPC, where systemctl inside the CT can hang).
#
# Flow (all on the PVE host, via one ssh + here-doc):
#   1. pct stop, then poll up to ~180s (90 x 2s) for "stopped".
#   2. pct mount so /var/lib/lxc/<vmid>/rootfs is actually bound — the path can exist
#      but be empty on LVM/ZFS storage without the mount (see note in the here-doc).
#   3. Remove pool/transaction/txpool files under the Besu data dirs, then
#      pct unmount and pct start.
# Every failure path attempts a pct start so the CT is not left down.
#
# Globals:   PROXMOX_USER (read) — SSH login user for the PVE host.
# Arguments: $1 - CT VMID; $2 - PVE host address; $3 - label for log output.
# Returns:   always 0 — a failed wipe only logs a warning so the sweep continues.
clear_peer_txpool_via_pct_mount() {
local VMID=$1
local HOST=$2
local LABEL=$3
local SSH_TARGET="${PROXMOX_USER}@${HOST}"
log_info "Clearing $LABEL (VMID $VMID on $HOST) via pct stop + rootfs wipe..."
# SC2087: the here-doc is intentionally unquoted — only $VMID expands locally;
# every other \$ is escaped so it expands on the remote PVE host.
# shellcheck disable=SC2087
if ssh -o ConnectTimeout=15 -o StrictHostKeyChecking=no "$SSH_TARGET" bash -s <<EOF
set +e
VMID=$VMID
echo " [pve] pct stop \$VMID (timeout 300s)"
timeout 300 pct stop "\$VMID" 2>/dev/null || true
for i in \$(seq 1 90); do
if pct status "\$VMID" 2>/dev/null | grep -qi stopped; then
echo " [pve] CT \$VMID stopped"
break
fi
sleep 2
done
if ! pct status "\$VMID" 2>/dev/null | grep -qi stopped; then
echo " [pve] WARN: CT \$VMID not stopped after wait — cannot wipe pool safely"
timeout 120 pct start "\$VMID" 2>/dev/null || true
exit 1
fi
sleep 2
# After pct stop, /var/lib/lxc/<vmid>/rootfs often exists but is empty until pct mount binds the CT disk (LVM/ZFS).
# Using the path without mount caused silent no-ops on r630-03 mesh CTs (no "cleared under" lines).
MP="/var/lib/lxc/\${VMID}/rootfs"
pct unmount "\$VMID" 2>/dev/null || true
if ! pct mount "\$VMID" 2>/dev/null; then
echo " [pve] WARN: pct mount \$VMID failed — cannot wipe pool safely"
timeout 120 pct start "\$VMID" 2>/dev/null || true
exit 1
fi
echo " [pve] pct mount bound \$MP"
if [ ! -d "\$MP" ]; then
echo " [pve] WARN: no rootfs at \$MP after mount"
pct unmount "\$VMID" 2>/dev/null || true
timeout 120 pct start "\$VMID" 2>/dev/null || true
exit 1
fi
for dd in "\$MP/data/besu" "\$MP/var/lib/besu" "\$MP/opt/besu"; do
if [ -d "\$dd" ]; then
find "\$dd" -type d -name "*pool*" -exec rm -rf {} \; 2>/dev/null || true
find "\$dd" -type f -name "*transaction*" -delete 2>/dev/null || true
find "\$dd" -type f -name "*txpool*" -delete 2>/dev/null || true
echo " [pve] cleared under \$dd"
fi
done
pct unmount "\$VMID" 2>/dev/null || true
sleep 2
echo " [pve] pct start \$VMID"
timeout 180 pct start "\$VMID" 2>/dev/null || true
sleep 2
exit 0
EOF
then
log_success " Peer CT $VMID — pool wipe via mount complete"
else
log_warn " Peer CT $VMID — mount path had issues; confirm: ssh ${SSH_TARGET} 'pct status $VMID'"
fi
echo ""
}
# Peer tier first (when enabled) so validators/RPC are not refilled from sentries mid-run.
# (VMID range labels below repaired — the original en-dashes were lost in encoding,
# e.g. "15001502" was "1500-1502".)
if [[ "$CLEAR_BESU_PEER_TXPOOLS" == "1" ]]; then
  log_section "Clearing Sentry transaction pools (1500-1502) — pct mount on host"
  for vmid in 1500 1501 1502; do
    if ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "${PROXMOX_USER}@${PROXMOX_R630}" \
      "pct list | grep -q '^${vmid} '" 2>/dev/null; then
      clear_peer_txpool_via_pct_mount "$vmid" "$PROXMOX_R630" "Sentry"
    else
      log_warn "Sentry VMID $vmid not found on ${PROXMOX_R630}"
    fi
  done
  log_section "Clearing current edge RPC pools (2420/2430/2440/2460/2470/2480) — pct mount on host"
  for vmid in 2420 2430 2440 2460 2470 2480; do
    if ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "${PROXMOX_USER}@${PROXMOX_R630}" \
      "pct list | grep -q '^${vmid} '" 2>/dev/null; then
      clear_peer_txpool_via_pct_mount "$vmid" "$PROXMOX_R630" "Besu RPC (Alltra/Hybx)"
    else
      log_warn "VMID $vmid not found on ${PROXMOX_R630}"
    fi
  done
  # r630-03: validators 1003-1004, sentries 1503-1508, Core2 2102, Fireblocks 2301/2304, ThirdWeb stack 2400/2402/2403
  log_section "Clearing Besu mesh on r630-03 (1503-1508, 2102, 2301, 2304, 2400-2403)"
  for vmid in 1503 1504 1505 1506 1507 1508 2102 2301 2304 2400 2402 2403; do
    if ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "${PROXMOX_USER}@${R630_03}" \
      "pct list | grep -q '^${vmid} '" 2>/dev/null; then
      clear_peer_txpool_via_pct_mount "$vmid" "$R630_03" "Besu mesh r630-03 VMID $vmid"
    else
      log_warn "VMID $vmid not found on ${R630_03}"
    fi
  done
  # r630-02: public 2201 + named RPC / ThirdWeb helper CTs (same P2P mesh)
  log_section "Clearing Besu mesh on r630-02 (2201, 2303, 2305-2308, 2401)"
  for vmid in 2201 2303 2305 2306 2307 2308 2401; do
    if ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "${PROXMOX_USER}@${R630_02}" \
      "pct list | grep -q '^${vmid} '" 2>/dev/null; then
      clear_peer_txpool_via_pct_mount "$vmid" "$R630_02" "Besu mesh r630-02 VMID $vmid"
    else
      log_warn "VMID $vmid not found on ${R630_02}"
    fi
  done
else
  log_info "Skipping sentry + Alltra/Hybx pool clear (set CLEAR_BESU_PEER_TXPOOLS=1 if stuck txs reappear on Core after a clear)."
fi
# Clear each validator CT in turn.
log_section "Clearing Validator Transaction Pools"
# Validators: 1000-1002 on r630-01; 1003-1004 on r630-03 (see ALL_VMIDS_ENDPOINTS.md).
# Each entry encodes "vmid:host:type".
VALIDATORS=(
  "1000:$PROXMOX_R630:Validator"
  "1001:$PROXMOX_R630:Validator"
  "1002:$PROXMOX_R630:Validator"
  "1003:$R630_03:Validator"
  "1004:$R630_03:Validator"
)
for spec in "${VALIDATORS[@]}"; do
  # Split the colon-separated triple with parameter expansion.
  vmid=${spec%%:*}
  host_and_type=${spec#*:}
  host=${host_and_type%%:*}
  node_type=${host_and_type#*:}
  clear_node_pool "$vmid" "$host" "$node_type"
done
# Clear RPC Core (2101) — try ML110 first, then r630-01.
log_section "Clearing RPC Transaction Pool (2101)"
# FIX: anchored '^2101 ' (same convention as the peer-tier checks above) — a bare
# '2101' would also match that digit run inside other VMIDs or container names.
if ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "${PROXMOX_USER}@${PROXMOX_ML110}" \
  "pct list | grep -q '^2101 '" 2>/dev/null; then
  clear_node_pool 2101 "$PROXMOX_ML110" "RPC"
elif ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "${PROXMOX_USER}@${PROXMOX_R630}" \
  "pct list | grep -q '^2101 '" 2>/dev/null; then
  clear_node_pool 2101 "$PROXMOX_R630" "RPC"
else
  log_warn "RPC node (2101) not found on either host"
fi
# Clear RPC Core Thirdweb admin (2103) — r630-01 per ALL_VMIDS_ENDPOINTS.md
log_section "Clearing RPC Core Thirdweb admin (2103)"
if ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "${PROXMOX_USER}@${PROXMOX_R630}" \
  "pct list | grep -q '^2103 '" 2>/dev/null; then
  clear_node_pool 2103 "$PROXMOX_R630" "RPC Thirdweb Core"
else
  log_warn "RPC Thirdweb Core (2103) not found on ${PROXMOX_R630}"
fi
# 2102 (r630-03) and 2201 (r630-02) are cleared in the peer-tier pct-mount pass when CLEAR_BESU_PEER_TXPOOLS=1
if [[ "$CLEAR_BESU_PEER_TXPOOLS" != "1" ]]; then
  log_section "Clearing RPC Core 2 (2102)"
  # FIX: anchored '^2102 ' / '^2201 ' (same convention as the peer-tier checks) so the
  # existence test matches only the exact VMID column, not a substring elsewhere.
  if ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "${PROXMOX_USER}@${R630_03}" \
    "pct list | grep -q '^2102 '" 2>/dev/null; then
    clear_node_pool 2102 "$R630_03" "RPC Core 2"
  else
    log_warn "RPC Core 2 (2102) not found on ${R630_03}"
  fi
  log_section "Clearing RPC Public (2201)"
  if ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "${PROXMOX_USER}@${R630_02}" \
    "pct list | grep -q '^2201 '" 2>/dev/null; then
    clear_node_pool 2201 "$R630_02" "RPC Public"
  else
    log_warn "RPC Public (2201) not found on ${R630_02}"
  fi
else
  log_info "Skipping duplicate 2102/2201 clear_node_pool (already wiped in peer-tier pass)."
fi
# Final banner plus operator follow-ups after the sweep.
log_section "Transaction Pool Clear Complete"
printf '%s\n' \
  "Next steps:" \
  " 1. Wait 30-60 seconds for nodes to fully restart" \
  " 2. Check pending transactions: bash scripts/verify/check-pending-transactions-chain138.sh" \
  " 3. Monitor health: bash scripts/monitoring/monitor-blockchain-health.sh"