Files
proxmox/scripts/clear-all-transaction-pools.sh
defiQUG dbd517b279 Sync workspace: config, docs, scripts, CI, operator rules, and submodule pointers.
- Update dbis_core, cross-chain-pmm-lps, explorer-monorepo, metamask-integration, pr-workspace/chains
- Omit embedded publish git dirs and empty placeholders from index

Made-with: Cursor
2026-04-12 06:12:20 -07:00

309 lines
13 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env bash
# Clear transaction pools on all Besu nodes (validators, Core/Thirdweb/public RPC).
#
# Stuck txs often reappear on Core when sentries or Alltra/Hybx RPC CTs (still on the P2P mesh) were not cleared.
# To include those: CLEAR_BESU_PEER_TXPOOLS=1 bash scripts/clear-all-transaction-pools.sh
# Peer tier (1500-1502, 2420/2430/2440/2460/2470/2480) uses pct stop + pct mount on the PVE host (avoids hung systemctl inside flaky CTs).
#
# SSH: uses PROXMOX_SSH_USER from config/ip-addresses.conf (root). If .env sets PROXMOX_USER=root@pam for the API,
# that value is NOT used for SSH (see PROXMOX_USER= assignment below).
# Afterward: Core RPC (2101) can take several minutes to bind JSON-RPC while RocksDB opens/compacts; public RPC may recover first.
set -euo pipefail

# Load IP configuration (best-effort: the hard-coded defaults below cover a missing conf).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
# shellcheck source=/dev/null
source "${PROJECT_ROOT}/config/ip-addresses.conf" 2>/dev/null || true

# Peer-tier Besu CTs (sentries, Alltra/Hybx RPC) retain mempool and can re-gossip txs to Core after a clear.
# Default: skip them (faster). To flush those pools too: CLEAR_BESU_PEER_TXPOOLS=1 bash scripts/clear-all-transaction-pools.sh
CLEAR_BESU_PEER_TXPOOLS="${CLEAR_BESU_PEER_TXPOOLS:-0}"

# Shell SSH must be root@host — not root@pam@host (.env often sets PROXMOX_USER=root@pam for API).
PROXMOX_SSH_USER="${PROXMOX_SSH_USER:-root}"
# Any '@' marks an API realm identity, which ssh would misparse as user@host — force plain root.
[[ "$PROXMOX_SSH_USER" == *"@"* ]] && PROXMOX_SSH_USER="root"
PROXMOX_USER="$PROXMOX_SSH_USER"

# PVE host addresses — first non-empty of: explicit override, conf alias, hard default.
PROXMOX_ML110="${PROXMOX_ML110:-${PROXMOX_HOST_ML110:-192.168.11.10}}"
PROXMOX_R630="${PROXMOX_R630:-${PROXMOX_R630_01:-${PROXMOX_HOST_R630_01:-192.168.11.11}}}"
R630_03="${PROXMOX_R630_03:-${PROXMOX_HOST_R630_03:-192.168.11.13}}"
R630_02="${PROXMOX_R630_02:-${PROXMOX_HOST_R630_02:-192.168.11.12}}"

# ANSI color codes for the log helpers below.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m'

# Leveled log helpers; $1 is the message (echo -e expands the stored color escapes).
log_info() { echo -e "${BLUE}[INFO]${NC} $1"; }
log_success() { echo -e "${GREEN}[✓]${NC} $1"; }
log_warn() { echo -e "${YELLOW}[⚠]${NC} $1"; }
log_error() { echo -e "${RED}[✗]${NC} $1"; }
log_section() { echo -e "\n${CYAN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"; echo -e "${CYAN}$1${NC}"; echo -e "${CYAN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}\n"; }

echo "=== Clear Transaction Pools on All Nodes ==="
echo ""
# Stop, wipe, and restart the Besu service inside one LXC container via pct exec.
# Arguments:
#   $1 - CT VMID
#   $2 - PVE host running the CT
#   $3 - human-readable node type label (log output only)
# Globals: PROXMOX_USER (read) — SSH login user.
# Notes: all four known unit names are attempted; the missing ones fail silently.
#        Remote "Configuration file" stderr chatter is filtered out of every step.
clear_node_pool() {
  local VMID=$1
  local HOST=$2
  local NODE_TYPE=$3
  local SSH_TARGET="${PROXMOX_USER}@${HOST}"
  log_info "Clearing transaction pool for $NODE_TYPE (VMID $VMID on $HOST)..."
  # Stop the service (timeout each stop — Besu can hang on SIGTERM during heavy I/O)
  log_info " Stopping service..."
  ssh -o ConnectTimeout=10 -o StrictHostKeyChecking=no "$SSH_TARGET" \
    "pct exec $VMID -- bash -c '
      stop_besu() {
        local n=\"\$1\"
        timeout 90 systemctl stop \"\${n}.service\" 2>/dev/null || timeout 90 systemctl stop \"\$n\" 2>/dev/null || true
      }
      stop_besu besu-validator
      stop_besu besu-rpc-core
      stop_besu besu-rpc
      stop_besu besu-sentry
    '" 2>&1 | grep -v "Configuration file" || true
  sleep 2
  # Find and clear transaction pool database
  log_info " Clearing transaction pool database..."
  # FIX: was '|| echo "Cleared"', which guaranteed CLEAR_RESULT was non-empty and made
  # the "may not exist" warn branch below unreachable. '|| true' keeps this safe under
  # set -e/pipefail while letting an empty result (no data dir found, or ssh failure)
  # actually reach the warn path.
  CLEAR_RESULT=$(ssh -o ConnectTimeout=10 -o StrictHostKeyChecking=no "$SSH_TARGET" \
    "pct exec $VMID -- bash -c '
      DATA_DIRS=\"/data/besu /var/lib/besu\"
      for DATA_DIR in \$DATA_DIRS; do
        if [ -d \"\$DATA_DIR\" ]; then
          # Find transaction pool database files
          find \"\$DATA_DIR\" -type d -name \"*pool*\" -exec rm -rf {} \; 2>/dev/null || true
          find \"\$DATA_DIR\" -type f -name \"*transaction*\" -delete 2>/dev/null || true
          find \"\$DATA_DIR\" -type f -name \"*txpool*\" -delete 2>/dev/null || true
          echo \"Cleared: \$DATA_DIR\"
        fi
      done
    '" 2>&1 | grep -v "Configuration file" || true)
  if [ -n "$CLEAR_RESULT" ]; then
    log_success " Transaction pool cleared"
  else
    log_warn " Could not clear transaction pool (may not exist)"
  fi
  # Restart the service (first successful start wins; timeout avoids indefinite hang)
  log_info " Restarting service..."
  ssh -o ConnectTimeout=10 -o StrictHostKeyChecking=no "$SSH_TARGET" \
    "pct exec $VMID -- bash -c '
      timeout 120 systemctl start besu-validator.service 2>/dev/null || timeout 120 systemctl start besu-validator 2>/dev/null || \
      timeout 120 systemctl start besu-rpc-core.service 2>/dev/null || timeout 120 systemctl start besu-rpc-core 2>/dev/null || \
      timeout 120 systemctl start besu-rpc.service 2>/dev/null || timeout 120 systemctl start besu-rpc 2>/dev/null || \
      timeout 120 systemctl start besu-sentry.service 2>/dev/null || timeout 120 systemctl start besu-sentry 2>/dev/null || true
    '" 2>&1 | grep -v "Configuration file" || true
  sleep 3
  # Verify at least one Besu unit is active (single line — avoids inactive\ninactive\nactive noise)
  STATUS=$(ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "$SSH_TARGET" \
    "pct exec $VMID -- bash -c '
      for u in besu-validator besu-rpc-core besu-rpc besu-sentry; do
        for s in \"\$u\" \"\${u}.service\"; do
          st=\$(systemctl is-active \"\$s\" 2>/dev/null || true)
          [ \"\$st\" = active ] && { echo active; exit 0; }
        done
      done
      echo inactive
    '" 2>&1 | grep -v "Configuration file" | tr -d '\r' | tail -1) \
    || STATUS="unknown"
  if [ "$STATUS" = "active" ]; then
    log_success " Service restarted and active"
  else
    log_warn " Service status: $STATUS"
  fi
  echo ""
}
# Clear txpool files while a peer-tier CT is stopped — mount its rootfs from the PVE
# host (reliable for sentries / Alltra RPC, where systemctl inside the CT can hang).
#
# Flow (all on the PVE host, via one ssh + here-doc):
#   1. pct stop, then poll up to ~180s (90 x 2s) for "stopped".
#   2. pct mount so /var/lib/lxc/<vmid>/rootfs is actually bound — the path can exist
#      but be empty on LVM/ZFS storage without the mount (see note in the here-doc).
#   3. Remove pool/transaction/txpool files under the Besu data dirs, then
#      pct unmount and pct start.
# Every failure path attempts a pct start so the CT is not left down.
#
# Globals:   PROXMOX_USER (read) — SSH login user for the PVE host.
# Arguments: $1 - CT VMID; $2 - PVE host address; $3 - label for log output.
# Returns:   always 0 — a failed wipe only logs a warning so the sweep continues.
clear_peer_txpool_via_pct_mount() {
local VMID=$1
local HOST=$2
local LABEL=$3
local SSH_TARGET="${PROXMOX_USER}@${HOST}"
log_info "Clearing $LABEL (VMID $VMID on $HOST) via pct stop + rootfs wipe..."
# SC2087: the here-doc is intentionally unquoted — only $VMID expands locally;
# every other \$ is escaped so it expands on the remote PVE host.
# shellcheck disable=SC2087
if ssh -o ConnectTimeout=15 -o StrictHostKeyChecking=no "$SSH_TARGET" bash -s <<EOF
set +e
VMID=$VMID
echo " [pve] pct stop \$VMID (timeout 300s)"
timeout 300 pct stop "\$VMID" 2>/dev/null || true
for i in \$(seq 1 90); do
if pct status "\$VMID" 2>/dev/null | grep -qi stopped; then
echo " [pve] CT \$VMID stopped"
break
fi
sleep 2
done
if ! pct status "\$VMID" 2>/dev/null | grep -qi stopped; then
echo " [pve] WARN: CT \$VMID not stopped after wait — cannot wipe pool safely"
timeout 120 pct start "\$VMID" 2>/dev/null || true
exit 1
fi
sleep 2
# After pct stop, /var/lib/lxc/<vmid>/rootfs often exists but is empty until pct mount binds the CT disk (LVM/ZFS).
# Using the path without mount caused silent no-ops on r630-03 mesh CTs (no "cleared under" lines).
MP="/var/lib/lxc/\${VMID}/rootfs"
pct unmount "\$VMID" 2>/dev/null || true
if ! pct mount "\$VMID" 2>/dev/null; then
echo " [pve] WARN: pct mount \$VMID failed — cannot wipe pool safely"
timeout 120 pct start "\$VMID" 2>/dev/null || true
exit 1
fi
echo " [pve] pct mount bound \$MP"
if [ ! -d "\$MP" ]; then
echo " [pve] WARN: no rootfs at \$MP after mount"
pct unmount "\$VMID" 2>/dev/null || true
timeout 120 pct start "\$VMID" 2>/dev/null || true
exit 1
fi
for dd in "\$MP/data/besu" "\$MP/var/lib/besu" "\$MP/opt/besu"; do
if [ -d "\$dd" ]; then
find "\$dd" -type d -name "*pool*" -exec rm -rf {} \; 2>/dev/null || true
find "\$dd" -type f -name "*transaction*" -delete 2>/dev/null || true
find "\$dd" -type f -name "*txpool*" -delete 2>/dev/null || true
echo " [pve] cleared under \$dd"
fi
done
pct unmount "\$VMID" 2>/dev/null || true
sleep 2
echo " [pve] pct start \$VMID"
timeout 180 pct start "\$VMID" 2>/dev/null || true
sleep 2
exit 0
EOF
then
log_success " Peer CT $VMID — pool wipe via mount complete"
else
log_warn " Peer CT $VMID — mount path had issues; confirm: ssh ${SSH_TARGET} 'pct status $VMID'"
fi
echo ""
}
# Peer tier first (when enabled) so validators/RPC are not refilled from sentries mid-run.
# (VMID range labels below repaired — the original en-dashes were lost in encoding,
# e.g. "15001502" was "1500-1502".)
if [[ "$CLEAR_BESU_PEER_TXPOOLS" == "1" ]]; then
  log_section "Clearing Sentry transaction pools (1500-1502) — pct mount on host"
  for vmid in 1500 1501 1502; do
    if ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "${PROXMOX_USER}@${PROXMOX_R630}" \
      "pct list | grep -q '^${vmid} '" 2>/dev/null; then
      clear_peer_txpool_via_pct_mount "$vmid" "$PROXMOX_R630" "Sentry"
    else
      log_warn "Sentry VMID $vmid not found on ${PROXMOX_R630}"
    fi
  done
  log_section "Clearing current edge RPC pools (2420/2430/2440/2460/2470/2480) — pct mount on host"
  for vmid in 2420 2430 2440 2460 2470 2480; do
    if ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "${PROXMOX_USER}@${PROXMOX_R630}" \
      "pct list | grep -q '^${vmid} '" 2>/dev/null; then
      clear_peer_txpool_via_pct_mount "$vmid" "$PROXMOX_R630" "Besu RPC (Alltra/Hybx)"
    else
      log_warn "VMID $vmid not found on ${PROXMOX_R630}"
    fi
  done
  # r630-03: validators 1003-1004, sentries 1503-1508, Core2 2102, Fireblocks 2301/2304, ThirdWeb stack 2400/2402/2403
  log_section "Clearing Besu mesh on r630-03 (1503-1508, 2102, 2301, 2304, 2400-2403)"
  for vmid in 1503 1504 1505 1506 1507 1508 2102 2301 2304 2400 2402 2403; do
    if ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "${PROXMOX_USER}@${R630_03}" \
      "pct list | grep -q '^${vmid} '" 2>/dev/null; then
      clear_peer_txpool_via_pct_mount "$vmid" "$R630_03" "Besu mesh r630-03 VMID $vmid"
    else
      log_warn "VMID $vmid not found on ${R630_03}"
    fi
  done
  # r630-02: public 2201 + named RPC / ThirdWeb helper CTs (same P2P mesh)
  log_section "Clearing Besu mesh on r630-02 (2201, 2303, 2305-2308, 2401)"
  for vmid in 2201 2303 2305 2306 2307 2308 2401; do
    if ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "${PROXMOX_USER}@${R630_02}" \
      "pct list | grep -q '^${vmid} '" 2>/dev/null; then
      clear_peer_txpool_via_pct_mount "$vmid" "$R630_02" "Besu mesh r630-02 VMID $vmid"
    else
      log_warn "VMID $vmid not found on ${R630_02}"
    fi
  done
else
  log_info "Skipping sentry + Alltra/Hybx pool clear (set CLEAR_BESU_PEER_TXPOOLS=1 if stuck txs reappear on Core after a clear)."
fi
# Clear each validator CT in turn.
log_section "Clearing Validator Transaction Pools"
# Validators: 1000-1002 on r630-01; 1003-1004 on r630-03 (see ALL_VMIDS_ENDPOINTS.md).
# Each entry encodes "vmid:host:type".
VALIDATORS=(
  "1000:$PROXMOX_R630:Validator"
  "1001:$PROXMOX_R630:Validator"
  "1002:$PROXMOX_R630:Validator"
  "1003:$R630_03:Validator"
  "1004:$R630_03:Validator"
)
for spec in "${VALIDATORS[@]}"; do
  # Split the colon-separated triple with parameter expansion.
  vmid=${spec%%:*}
  host_and_type=${spec#*:}
  host=${host_and_type%%:*}
  node_type=${host_and_type#*:}
  clear_node_pool "$vmid" "$host" "$node_type"
done
# Clear RPC Core (2101) — try ML110 first, then r630-01.
log_section "Clearing RPC Transaction Pool (2101)"
# FIX: anchored '^2101 ' (same convention as the peer-tier checks above) — a bare
# '2101' would also match that digit run inside other VMIDs or container names.
if ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "${PROXMOX_USER}@${PROXMOX_ML110}" \
  "pct list | grep -q '^2101 '" 2>/dev/null; then
  clear_node_pool 2101 "$PROXMOX_ML110" "RPC"
elif ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "${PROXMOX_USER}@${PROXMOX_R630}" \
  "pct list | grep -q '^2101 '" 2>/dev/null; then
  clear_node_pool 2101 "$PROXMOX_R630" "RPC"
else
  log_warn "RPC node (2101) not found on either host"
fi
# Clear RPC Core Thirdweb admin (2103) — r630-01 per ALL_VMIDS_ENDPOINTS.md
log_section "Clearing RPC Core Thirdweb admin (2103)"
if ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "${PROXMOX_USER}@${PROXMOX_R630}" \
  "pct list | grep -q '^2103 '" 2>/dev/null; then
  clear_node_pool 2103 "$PROXMOX_R630" "RPC Thirdweb Core"
else
  log_warn "RPC Thirdweb Core (2103) not found on ${PROXMOX_R630}"
fi
# 2102 (r630-03) and 2201 (r630-02) are cleared in the peer-tier pct-mount pass when CLEAR_BESU_PEER_TXPOOLS=1
if [[ "$CLEAR_BESU_PEER_TXPOOLS" != "1" ]]; then
  log_section "Clearing RPC Core 2 (2102)"
  # FIX: anchored '^2102 ' / '^2201 ' (same convention as the peer-tier checks) so the
  # existence test matches only the exact VMID column, not a substring elsewhere.
  if ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "${PROXMOX_USER}@${R630_03}" \
    "pct list | grep -q '^2102 '" 2>/dev/null; then
    clear_node_pool 2102 "$R630_03" "RPC Core 2"
  else
    log_warn "RPC Core 2 (2102) not found on ${R630_03}"
  fi
  log_section "Clearing RPC Public (2201)"
  if ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "${PROXMOX_USER}@${R630_02}" \
    "pct list | grep -q '^2201 '" 2>/dev/null; then
    clear_node_pool 2201 "$R630_02" "RPC Public"
  else
    log_warn "RPC Public (2201) not found on ${R630_02}"
  fi
else
  log_info "Skipping duplicate 2102/2201 clear_node_pool (already wiped in peer-tier pass)."
fi
# Final banner plus operator follow-ups after the sweep.
log_section "Transaction Pool Clear Complete"
printf '%s\n' \
  "Next steps:" \
  " 1. Wait 30-60 seconds for nodes to fully restart" \
  " 2. Check pending transactions: bash scripts/verify/check-pending-transactions-chain138.sh" \
  " 3. Monitor health: bash scripts/monitoring/monitor-blockchain-health.sh"