Files
proxmox/scripts/maintenance/diagnose-and-fix-502s-via-ssh.sh
defiQUG 0d29343941 chore: update .env.master.example with new deployment scripts and treasury manager parameters; enhance AGENTS.md with GRU reference primacy details
- Added new deployment script references for Aave quote-push and treasury manager in .env.master.example.
- Updated AGENTS.md to include information on GRU reference primacy versus public PMM mesh execution model.
- Minor updates to various documentation files to reflect changes in policy and operational guidelines.

Made-with: Cursor
2026-04-12 18:20:41 -07:00

140 lines
6.4 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env bash
# Deep dive: diagnose and fix every 502 from E2E routing.
# For each known backend (domain → IP:port), SSH to Proxmox, check container + port, fix.
#
# Usage: ./scripts/maintenance/diagnose-and-fix-502s-via-ssh.sh [--apply] [--dry-run] [--diagnose-only]
# Requires: SSH to r630-01, r630-02, ml110 (key-based).
# Strict mode: abort on unhandled errors, on unset variables, and on a failure
# in any stage of a pipeline.
set -euo pipefail

# Absolute directory holding this script, and the project root two levels up.
SCRIPT_DIR="$(dirname "${BASH_SOURCE[0]}")"
SCRIPT_DIR="$(cd "$SCRIPT_DIR" && pwd)"
PROJECT_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"

# Optional project configuration, sourced best-effort: a missing file is
# skipped and errors raised while sourcing are silenced and ignored.
# The two files are handled one after the other (not precomputed) because the
# first one may change PROJECT_ROOT before the second path is evaluated.
if [[ -f "${PROJECT_ROOT}/scripts/lib/load-project-env.sh" ]]; then
  source "${PROJECT_ROOT}/scripts/lib/load-project-env.sh" 2>/dev/null || true
fi
if [[ -f "${PROJECT_ROOT}/config/ip-addresses.conf" ]]; then
  source "${PROJECT_ROOT}/config/ip-addresses.conf" 2>/dev/null || true
fi
# Mode flags. The script is read-only by default: nothing is started or
# changed on the remote side unless --apply is passed.
DRY_RUN=true
DIAGNOSE_ONLY=false

#######################################
# Apply command-line flags to the mode variables.
# Globals:   DRY_RUN (written), DIAGNOSE_ONLY (written)
# Arguments: the script's command-line arguments
#            --dry-run        report only (default)
#            --apply          allow remediation commands to run
#            --diagnose-only  never remediate, even together with --apply
# Outputs:   a warning on stderr for every unrecognized argument
# Returns:   0
#######################################
_parse_cli_flags() {
  local flag
  for flag in "$@"; do
    case "$flag" in
      --dry-run) DRY_RUN=true ;;
      --apply) DRY_RUN=false ;;
      --diagnose-only) DIAGNOSE_ONLY=true ;;
      # A misspelled --apply would otherwise silently leave the run in
      # dry-run mode; say so instead of ignoring it without a trace.
      *) printf 'warning: ignoring unrecognized argument: %s\n' "$flag" >&2 ;;
    esac
  done
}
_parse_cli_flags "$@"
# Proxmox hosts that carry the backends. Each address can be overridden through
# the matching PROXMOX_HOST_* variable; otherwise the literal default is used.
R630_01=${PROXMOX_HOST_R630_01:-192.168.11.11}
R630_02=${PROXMOX_HOST_R630_02:-192.168.11.12}
ML110=${PROXMOX_HOST_ML110:-192.168.11.10}

# Backend table, one row per public domain, fields separated by '|':
#   domain|backend_ip|backend_port|vmid|host|description
BACKENDS=()

# DBIS frontends and API nodes (r630-01)
BACKENDS+=(
  "dbis-admin.d-bis.org|192.168.11.130|80|10130|${R630_01}|dbis-frontend nginx"
  "secure.d-bis.org|192.168.11.130|80|10130|${R630_01}|dbis-frontend nginx"
  "dbis-api.d-bis.org|192.168.11.155|3000|10150|${R630_01}|dbis-api node"
  "dbis-api-2.d-bis.org|192.168.11.156|3000|10151|${R630_01}|dbis-api node"
)

# rpc-http-prv (r630-01)
BACKENDS+=(
  "rpc-http-prv.d-bis.org|192.168.11.211|8545|2101|${R630_01}|besu RPC"
)

# MIM4U www (r630-02)
BACKENDS+=(
  "www.mim4u.org|192.168.11.37|80|7810|${R630_02}|mim-web nginx"
)

# Edge RPC (r630-01)
BACKENDS+=(
  "rpc-alltra.d-bis.org|192.168.11.172|8545|2420|${R630_01}|besu alltra"
  "rpc-alltra-2.d-bis.org|192.168.11.173|8545|2430|${R630_01}|besu alltra"
  "rpc-alltra-3.d-bis.org|192.168.11.174|8545|2440|${R630_01}|besu alltra"
  "rpc-hybx.d-bis.org|192.168.11.246|8545|2460|${R630_01}|besu hybx"
  "rpc-hybx-2.d-bis.org|192.168.11.247|8545|2470|${R630_01}|besu hybx"
  "rpc-hybx-3.d-bis.org|192.168.11.248|8545|2480|${R630_01}|besu hybx"
)

# Cacti alltra/hybx at IPs .177 and .251.
# NOTE(review): an earlier note marked these VMIDs as TBD; the rows below carry
# 5201/5202 - confirm they are the real container IDs.
BACKENDS+=(
  "cacti-alltra.d-bis.org|192.168.11.177|80|5201|${R630_02}|cacti web"
  "cacti-hybx.d-bis.org|192.168.11.251|80|5202|${R630_02}|cacti web"
)
#######################################
# Run one command string as root on a remote host over SSH.
# Arguments: $1 - host name or IP
#            $2 - command line executed by the remote shell
# Outputs:   whatever the remote command writes to stdout/stderr
# Returns:   the exit status of ssh (the remote command's status, or ssh's own
#            failure status when the connection cannot be made)
#######################################
run_ssh() {
  # BatchMode=yes: this script is documented as key-based only, so fail fast
  # instead of stopping at an interactive password/passphrase prompt when key
  # authentication does not work.
  # NOTE(review): StrictHostKeyChecking=no disables host-key verification;
  # kept as-is for compatibility, consider accept-new.
  ssh -o BatchMode=yes -o ConnectTimeout=10 -o StrictHostKeyChecking=no "root@$1" "$2"
}
# Print an informational message to stdout with a blue [INFO] tag.
# Arguments: $1 - message text (printed literally; optional)
# The message goes through %s so backslashes in it (for example in text that
# came back from a remote command) are not interpreted as escape sequences.
log_info() { printf '\033[0;34m[INFO]\033[0m %s\n' "${1-}"; }
# Print a success message to stdout with a green check-mark tag.
# Arguments: $1 - message text (printed literally; optional)
# The message goes through %s so backslashes in it are not interpreted as
# escape sequences.
log_ok() { printf '\033[0;32m[✓]\033[0m %s\n' "${1-}"; }
# Print a warning message to stdout with a yellow warning-sign tag.
# Arguments: $1 - message text (printed literally; optional)
# The message goes through %s so backslashes in it (for example in a status
# string returned by a remote command) are not interpreted as escape sequences.
log_warn() { printf '\033[0;33m[⚠]\033[0m %s\n' "${1-}"; }
# Print an error message to stdout with a red cross tag.
# Arguments: $1 - message text (printed literally; optional)
# The message goes through %s so backslashes in it are not interpreted as
# escape sequences.
log_err() { printf '\033[0;31m[✗]\033[0m %s\n' "${1-}"; }
# Run banner: a title line plus the effective mode flags, framed by blank lines.
printf '\n'
printf '%s\n' "=== 502 deep dive: diagnose and fix each backend ==="
printf '%s\n' " dry-run=$DRY_RUN diagnose-only=$DIAGNOSE_ONLY"
printf '\n'
#######################################
# Probe http://IP:PORT/ from a Proxmox host and print the HTTP status code.
# Arguments: $1 - Proxmox host to run curl on
#            $2 - backend IP
#            $3 - backend port
#            $4 - curl connect timeout in seconds
# Outputs:   the first three characters of curl's %{http_code} on stdout;
#            "000" when nothing answered or the remote command failed
#######################################
_probe_http_code() {
  local pve_host=$1 ip=$2 port=$3 timeout=$4 raw
  # curl prints 000 itself when it cannot connect and then exits non-zero, so
  # the fallback may append a second 000; only the first three characters are
  # kept below.
  raw=$(run_ssh "$pve_host" "curl -s -o /dev/null -w '%{http_code}' --connect-timeout ${timeout} http://${ip}:${port}/ 2>/dev/null" || echo "000")
  raw=${raw//[$'\r\n']/}
  printf '%s' "${raw:0:3}"
}

#######################################
# Show the first 20 lines of `ss -tlnp` from inside a container, if any.
# Arguments: $1 - Proxmox host, $2 - container VMID
# Outputs:   one " Listening in CT: ..." line on stdout, or nothing
#######################################
_report_listeners() {
  local pve_host=$1 vmid=$2 listening
  listening=$(run_ssh "$pve_host" "pct exec $vmid -- ss -tlnp 2>/dev/null | head -20" 2>/dev/null || true)
  if [[ -n "$listening" ]]; then
    echo " Listening in CT: $listening"
  fi
}

#######################################
# Diagnose one backend row and, when allowed, try to bring it back.
# Steps: resolve the owning host (discovering it by container IP when the row
# has none), check SSH reachability, check the container status, then probe
# the port; remediation commands run only when DRY_RUN is not "true" and
# DIAGNOSE_ONLY is not "true".
# Globals:   DRY_RUN, DIAGNOSE_ONLY, R630_01, R630_02, ML110 (read)
# Arguments: $1 - row "domain|backend_ip|backend_port|vmid|host|description"
# Outputs:   progress and findings on stdout, followed by a blank line
# Returns:   0
#######################################
_diagnose_backend() {
  local entry=$1
  local domain ip port vmid host desc
  local apply=false
  local h v list cip status code code2 body

  IFS='|' read -r domain ip port vmid host desc <<< "$entry"
  if [[ -z "$domain" ]]; then
    return 0
  fi

  # Remediation needs both an explicit --apply and the absence of
  # --diagnose-only; unset mode variables fall back to the read-only side.
  if [[ "${DRY_RUN:-true}" != true && "${DIAGNOSE_ONLY:-false}" != true ]]; then
    apply=true
  fi

  log_info "--- $domain → $ip:$port ($desc) ---"

  if [[ -z "$host" ]]; then
    log_warn " No host; will try to discover VMID by IP on r630-01/r630-02/ml110"
    for h in "$R630_01" "$R630_02" "$ML110"; do
      run_ssh "$h" "echo OK" &>/dev/null || continue
      list=$(run_ssh "$h" "pct list 2>/dev/null | awk 'NR>1{print \$1}'" 2>/dev/null || true)
      # Word-splitting is intended here: $list is a whitespace-separated list
      # of VMIDs. A for loop (rather than `while read`) also keeps ssh from
      # consuming the loop's input.
      for v in $list; do
        cip=$(run_ssh "$h" "pct exec $v -- hostname -I 2>/dev/null | awk '{print \$1}'" 2>/dev/null || true)
        if [[ "$cip" == "$ip" ]]; then
          vmid=$v
          host=$h
          log_info " Found VMID $vmid on $host"
          break 2
        fi
      done
    done
    if [[ -z "$host" ]]; then
      log_warn " Could not find container for $ip; skipping"
      echo ""
      return 0
    fi
  fi

  if ! run_ssh "$host" "echo OK" &>/dev/null; then
    log_warn " Cannot SSH to $host"
    echo ""
    return 0
  fi

  status=$(run_ssh "$host" "pct status $vmid 2>/dev/null | awk '{print \$2}'" 2>/dev/null || echo "missing")
  if [[ "$status" != "running" ]]; then
    log_warn " Container $vmid status: ${status:-empty} (host $host)"
    if [[ "$apply" == true ]]; then
      if run_ssh "$host" "pct start $vmid" 2>/dev/null; then
        log_ok " Started $vmid"
      else
        log_err " Failed to start $vmid"
      fi
    fi
    echo ""
    return 0
  fi

  # Check if the port answers (from the host: curl to the container IP).
  code=$(_probe_http_code "$host" "$ip" "$port" 2)
  if [[ -n "$code" && "$code" != "000" ]]; then
    log_ok " $ip:$port responds (HTTP $code)"
    echo ""
    return 0
  fi

  if [[ "$port" == "8545" ]]; then
    # A plain GET may be refused on an RPC port; try a JSON-RPC call instead.
    body=$(run_ssh "$host" "curl -s -X POST -H 'Content-Type: application/json' -d '{\"jsonrpc\":\"2.0\",\"method\":\"eth_chainId\",\"params\":[],\"id\":1}' --connect-timeout 2 http://${ip}:${port}/ 2>/dev/null" || true)
    # Pattern match instead of `echo | grep -q`, which can report a false
    # negative under pipefail when grep exits before echo has finished writing.
    if [[ "$body" == *result* ]]; then
      log_ok " Port $port responds (JSON-RPC)"
    else
      log_warn " Port $port not responding from $host"
      _report_listeners "$host" "$vmid"
      if [[ "$apply" == true ]]; then
        if run_ssh "$host" "pct exec $vmid -- systemctl start besu 2>/dev/null"; then
          log_ok " Started besu in $vmid"
        fi
        echo " (Besu may take 30-60s to bind; re-run script to verify)"
      fi
    fi
  else
    log_warn " Port $port not responding (curl got $code)"
    # Show what is listening inside the CT.
    _report_listeners "$host" "$vmid"
    if [[ "$apply" == true ]]; then
      # Best effort: the table does not say which unit serves the port, so
      # each candidate service is started and failures are ignored.
      run_ssh "$host" "pct exec $vmid -- systemctl start nginx 2>/dev/null" || true
      run_ssh "$host" "pct exec $vmid -- systemctl start apache2 2>/dev/null" || true
      run_ssh "$host" "pct exec $vmid -- systemctl start dbis-api 2>/dev/null" \
        || run_ssh "$host" "pct exec $vmid -- systemctl start node 2>/dev/null" \
        || true
      sleep 2
      code2=$(_probe_http_code "$host" "$ip" "$port" 3)
      if [[ -n "$code2" && "$code2" != "000" ]]; then
        log_ok " After start: $ip:$port responds (HTTP $code2)"
      else
        log_warn " After start: $ip:$port still not responding"
      fi
    fi
  fi
  echo ""
}

# Walk the backend table and handle each row independently.
for line in "${BACKENDS[@]}"; do
  _diagnose_backend "$line"
done
# Closing note: point the operator at the end-to-end routing check, then end
# the output with a blank line.
log_ok "Done. Re-run E2E: ./scripts/verify/verify-end-to-end-routing.sh"
printf '\n'