chore: update submodule references and documentation
Some checks failed
Deploy to Phoenix / deploy (push) Has been cancelled
Some checks failed
Deploy to Phoenix / deploy (push) Has been cancelled
- Marked submodules ai-mcp-pmm-controller, explorer-monorepo, and smom-dbis-138 as dirty to reflect recent changes.
- Updated documentation to clarify operator script usage, including dotenv loading and task execution instructions.
- Enhanced the README and various index files to provide clearer navigation and task completion guidance.

Made-with: Cursor
This commit is contained in:
80
scripts/maintenance/ensure-core-rpc-config-2101-2102.sh
Executable file
80
scripts/maintenance/ensure-core-rpc-config-2101-2102.sh
Executable file
@@ -0,0 +1,80 @@
|
||||
#!/usr/bin/env bash
# Ensure Core RPC nodes 2101 and 2102 have TXPOOL and ADMIN (and DEBUG) in rpc-http-api and rpc-ws-api.
# Does NOT add txpool_besuClear/txpool_clear/admin_removeTransaction — Besu does not implement them.
# See: docs/04-configuration/CORE_RPC_2101_2102_TXPOOL_ADMIN_STATUS.md
#
# Usage: ./scripts/maintenance/ensure-core-rpc-config-2101-2102.sh [--dry-run] [--2101-only] [--2102-only]

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
# Optional PROXMOX_HOST_* overrides; the defaults below apply if the file is absent.
source "${PROJECT_ROOT}/config/ip-addresses.conf" 2>/dev/null || true

# Canonical API list for Core RPC (max that Besu supports for txpool + admin)
RPC_HTTP_API='["ETH","NET","WEB3","TXPOOL","QBFT","ADMIN","DEBUG","TRACE"]'
RPC_WS_API='["ETH","NET","WEB3","TXPOOL","QBFT","ADMIN"]'

VMID_2101=2101
VMID_2102=2102
HOST_2101="${PROXMOX_HOST_R630_01:-192.168.11.11}"
HOST_2102="${PROXMOX_HOST_ML110:-192.168.11.10}"
CONFIG_2101="/etc/besu/config-rpc-core.toml"
CONFIG_2102="/etc/besu/config-rpc.toml"

DRY_RUN=false
ONLY_2101=false
ONLY_2102=false
for a in "$@"; do
  [[ "$a" == "--dry-run" ]] && DRY_RUN=true
  [[ "$a" == "--2101-only" ]] && ONLY_2101=true
  [[ "$a" == "--2102-only" ]] && ONLY_2102=true
done

run_ssh() { ssh -o ConnectTimeout=10 -o StrictHostKeyChecking=no root@"$1" "$2"; }
log_ok() { echo -e "\033[0;32m[✓]\033[0m $1"; }
log_info() { echo -e "\033[0;34m[INFO]\033[0m $1"; }
log_warn() { echo -e "\033[0;33m[⚠]\033[0m $1"; }

#######################################
# Back up the Besu TOML inside the LXC, set rpc-http-api / rpc-ws-api to the
# canonical lists, then restart besu-rpc.
# Arguments: $1 - VMID, $2 - Proxmox host IP, $3 - config path inside the CT
# Returns:   0 on success, 1 on SSH/update/restart failure
#######################################
ensure_apis() {
  local vmid=$1
  local host=$2
  local config_path=$3
  log_info "VMID $vmid ($host): ensuring $config_path has TXPOOL, ADMIN, DEBUG..."
  if $DRY_RUN; then
    echo " Would set rpc-http-api and rpc-ws-api to include TXPOOL, ADMIN, DEBUG, QBFT, TRACE (2101/2102)"
    return 0
  fi
  # Pass API lists via env so quoting is safe; the remote script edits the config.
  # BUGFIX: grep is anchored to '^rpc-http-api=' / '^rpc-ws-api=' — the old
  # unanchored match also hit commented-out lines, in which case sed changed
  # nothing and the config was silently left broken. The grep/sed/append logic
  # is now an explicit if/else so a sed failure cannot fall through to the
  # append branch and duplicate the key.
  run_ssh "$host" "pct exec $vmid -- env RPC_HTTP_API='$RPC_HTTP_API' RPC_WS_API='$RPC_WS_API' CFG='$config_path' bash -c '
set -e
[ -f \"\$CFG\" ] || { echo \"Config \$CFG not found\"; exit 1; }
cp \"\$CFG\" \"\${CFG}.bak.\$(date +%Y%m%d%H%M%S)\"
if grep -q \"^rpc-http-api=\" \"\$CFG\"; then
  sed -i \"s|^rpc-http-api=.*|rpc-http-api=\$RPC_HTTP_API|\" \"\$CFG\"
else
  echo \"rpc-http-api=\$RPC_HTTP_API\" >> \"\$CFG\"
fi
if grep -q \"^rpc-ws-api=\" \"\$CFG\"; then
  sed -i \"s|^rpc-ws-api=.*|rpc-ws-api=\$RPC_WS_API|\" \"\$CFG\"
else
  echo \"rpc-ws-api=\$RPC_WS_API\" >> \"\$CFG\"
fi
chown besu:besu \"\$CFG\" 2>/dev/null || true
echo OK
'" 2>/dev/null || { log_warn "VMID $vmid: SSH or config update failed"; return 1; }
  log_ok "VMID $vmid: config updated"
  log_info "Restarting besu-rpc on $vmid..."
  run_ssh "$host" "pct exec $vmid -- systemctl restart besu-rpc 2>/dev/null || pct exec $vmid -- systemctl restart besu-rpc.service 2>/dev/null" || { log_warn "Restart failed for $vmid"; return 1; }
  log_ok "VMID $vmid: besu-rpc restarted"
  return 0
}

echo ""
echo "=== Ensure Core RPC 2101 / 2102 — TXPOOL + ADMIN (max Besu supports) ==="
echo " dry-run=$DRY_RUN 2101-only=$ONLY_2101 2102-only=$ONLY_2102"
echo " Note: txpool_besuClear, txpool_clear, admin_removeTransaction are NOT in Besu; use clear-all-transaction-pools.sh to clear stuck txs."
echo ""

# Run per-node; '|| true' so a failure on one node does not abort the other.
if [[ "$ONLY_2102" != true ]]; then
  ensure_apis "$VMID_2101" "$HOST_2101" "$CONFIG_2101" || true
fi
if [[ "$ONLY_2101" != true ]]; then
  ensure_apis "$VMID_2102" "$HOST_2102" "$CONFIG_2102" || true
fi

echo ""
echo "Done. Verify: ./scripts/maintenance/health-check-rpc-2101.sh and curl to 192.168.11.212:8545 for 2102."
echo "Ref: docs/04-configuration/CORE_RPC_2101_2102_TXPOOL_ADMIN_STATUS.md"
|
||||
84
scripts/maintenance/fix-block-production-staggered-restart.sh
Executable file
84
scripts/maintenance/fix-block-production-staggered-restart.sh
Executable file
@@ -0,0 +1,84 @@
|
||||
#!/usr/bin/env bash
# Staggered restart of Chain 138 validators to restore block production without losing quorum.
# When all 5 validators are restarted at once (e.g. clear-all-transaction-pools), they can all
# enter "full sync" and no node is at head to produce blocks. Restarting one at a time lets
# the rest stay at head so the restarted node syncs quickly and consensus can continue.
#
# Usage: ./scripts/maintenance/fix-block-production-staggered-restart.sh [--dry-run]
# Requires: SSH to Proxmox hosts (192.168.11.10 ML110, 192.168.11.11 R630-01, 192.168.11.12 R630-02)

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
# Optional PROXMOX_HOST_* / RPC_URL_138 overrides; defaults below apply otherwise.
source "${PROJECT_ROOT}/config/ip-addresses.conf" 2>/dev/null || true

DRY_RUN=false
[[ "${1:-}" == "--dry-run" ]] && DRY_RUN=true

RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'
log_info() { echo -e "${BLUE}[INFO]${NC} $1"; }
log_ok() { echo -e "${GREEN}[✓]${NC} $1"; }
log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; }

# Order: restart one at a time; wait between so restarted node can sync from others
# VMID : host
VALIDATORS=(
  "1004:${PROXMOX_HOST_ML110:-192.168.11.10}"
  "1003:${PROXMOX_HOST_ML110:-192.168.11.10}"
  "1002:${PROXMOX_HOST_R630_01:-192.168.11.11}"
  "1001:${PROXMOX_HOST_R630_01:-192.168.11.11}"
  "1000:${PROXMOX_HOST_R630_01:-192.168.11.11}"
)
WAIT_BETWEEN=90
RPC="${RPC_URL_138:-http://192.168.11.211:8545}"

# Query eth_blockNumber via JSON-RPC; prints "0x0" on timeout/error (requires jq).
get_block() {
  curl -s -m 5 -X POST -H "Content-Type: application/json" \
    -d '{"jsonrpc":"2.0","method":"eth_blockNumber","params":[],"id":1}' "$RPC" 2>/dev/null | jq -r '.result // "0x0"'
}

echo "=== Staggered validator restart (fix block production) ==="
echo " RPC: $RPC"
echo " Wait between restarts: ${WAIT_BETWEEN}s"
$DRY_RUN && echo " (DRY RUN - no restarts)"
echo ""

BLOCK_BEFORE=$(get_block)
log_info "Block before: $BLOCK_BEFORE"

idx=0
total=${#VALIDATORS[@]}
for entry in "${VALIDATORS[@]}"; do
  idx=$((idx + 1))
  IFS=: read -r vmid host <<< "$entry"
  log_info "Restarting validator $vmid on $host..."
  if $DRY_RUN; then
    echo " Would: ssh root@$host 'pct exec $vmid -- systemctl restart besu-validator'"
  else
    # Allow up to 120s for restart (Besu stop/start can take 1-2 min)
    if timeout 120 ssh -o ConnectTimeout=10 -o StrictHostKeyChecking=no root@"$host" "pct exec $vmid -- systemctl restart besu-validator" 2>/dev/null; then
      log_ok " $vmid restarted"
    else
      log_warn " $vmid restart timed out or failed (node may still be restarting)"
    fi
  fi
  # Wait between restarts, but not after the final validator in the list.
  # BUGFIX: was hardcoded as [[ "$vmid" != "1000" ]], which silently breaks
  # (extra or missing waits) whenever VALIDATORS is reordered or edited.
  if ! $DRY_RUN && (( idx < total )); then
    log_info " Waiting ${WAIT_BETWEEN}s for node to rejoin and sync..."
    sleep "$WAIT_BETWEEN"
  fi
done

if ! $DRY_RUN; then
  log_info "Waiting 30s then checking block production..."
  sleep 30
  BLOCK_AFTER=$(get_block)
  log_info "Block after: $BLOCK_AFTER"
  echo ""
  echo "Run monitor to confirm blocks are advancing:"
  echo " ./scripts/monitoring/monitor-blockchain-health.sh"
  echo " watch -n 5 'cast block-number --rpc-url $RPC'"
fi

log_ok "Done."
|
||||
60
scripts/maintenance/proxmox-load-balance-suggest.sh
Normal file
60
scripts/maintenance/proxmox-load-balance-suggest.sh
Normal file
@@ -0,0 +1,60 @@
|
||||
#!/usr/bin/env bash
# Suggest load-balancing migrations: show current load and example commands to move
# containers from r630-01 to r630-02 (or ml110). Run from project root.
#
# Usage: bash scripts/maintenance/proxmox-load-balance-suggest.sh

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
[[ -f "${PROJECT_ROOT}/config/ip-addresses.conf" ]] && source "${PROJECT_ROOT}/config/ip-addresses.conf" 2>/dev/null || true

R630_01="${PROXMOX_HOST_R630_01:-192.168.11.11}"
R630_02="${PROXMOX_HOST_R630_02:-192.168.11.12}"
ML110="${PROXMOX_HOST_ML110:-192.168.11.10}"
# Array form: each option stays a separate word without relying on unquoted expansion (SC2086).
SSH_OPTS=(-o ConnectTimeout=8 -o StrictHostKeyChecking=no)

# Candidates safe to suggest (r630-01 -> r630-02). Excludes NPMplus main, core RPC, validators, sentries, DBIS core.
CANDIDATES="3500 3501 7804 8640 8642 10232 10235 10236"

echo ""
echo "=== Proxmox load balance — suggestion ==="
echo ""

# Current load and counts per host; SSH failures leave the fields blank.
for entry in "r630-01:$R630_01" "r630-02:$R630_02" "ml110:$ML110"; do
  IFS=: read -r node_name ip <<< "$entry"
  out=$(ssh "${SSH_OPTS[@]}" root@"$ip" "
    echo \"LOAD|\$(cat /proc/loadavg 2>/dev/null | cut -d' ' -f1-3)\"
    echo \"LXC|\$(pct list 2>/dev/null | tail -n +2 | wc -l)\"
  " 2>/dev/null) || true
  load=$(echo "$out" | awk -F'|' '$1=="LOAD"{print $2}')
  lxc=$(echo "$out" | awk -F'|' '$1=="LXC"{print $2}')
  printf " %-10s %s LXC: %s\n" "$node_name" "load: $load" "$lxc"
done

echo ""
echo "--- Suggested migrations (r630-01 → r630-02) ---"
echo "Run from project root. Use --dry-run first. Target storage on r630-02: thin1, thin2, thin5, thin6."
echo ""

for vmid in $CANDIDATES; do
  # Check if CT exists on r630-01
  on_src=$(ssh "${SSH_OPTS[@]}" root@"$R630_01" "pct list 2>/dev/null | awk '\$1==$vmid{print \$1}'" 2>/dev/null) || true
  if [[ -n "$on_src" ]]; then
    # BUGFIX: the old 'name=$(ssh ...) || echo "CT-$vmid"' printed the fallback
    # to stdout instead of assigning it, so $name stayed empty on SSH failure.
    # Assign the fallback whenever the lookup fails OR returns nothing.
    name=$(ssh "${SSH_OPTS[@]}" root@"$R630_01" "pct config $vmid 2>/dev/null | grep -E '^hostname:|^name:' | head -1 | sed 's/^[^:]*:[[:space:]]*//'" 2>/dev/null) || true
    [[ -n "$name" ]] || name="CT-$vmid"
    echo " VMID $vmid ($name):"
    echo " ./scripts/maintenance/migrate-ct-r630-01-to-r630-02.sh $vmid thin1 --dry-run"
    echo " ./scripts/maintenance/migrate-ct-r630-01-to-r630-02.sh $vmid thin1 --destroy-source"
    echo ""
  fi
done

echo "--- Cluster check (optional) ---"
echo "If nodes are in the same cluster, you can try live migrate from r630-01:"
echo " ssh root@$R630_01 \"pvecm status\""
echo " ssh root@$R630_01 \"pct migrate <VMID> r630-02 --storage thin1 --restart\""
echo ""
echo "See: docs/04-configuration/PROXMOX_LOAD_BALANCING_RUNBOOK.md"
echo ""
|
||||
Reference in New Issue
Block a user