Files
proxmox/scripts/maintenance/diagnose-and-fix-502s-via-ssh.sh
defiQUG bea1903ac9
Some checks failed
Deploy to Phoenix / deploy (push) Has been cancelled
Sync all local changes: docs, config, scripts, submodule refs, verification evidence
Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-21 15:46:06 -08:00

146 lines
7.0 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env bash
# Deep dive: diagnose and fix every 502 from E2E routing.
# For each known backend (domain → IP:port), SSH to Proxmox, check container + port, fix.
#
# Usage: ./scripts/maintenance/diagnose-and-fix-502s-via-ssh.sh [--dry-run] [--diagnose-only]
# Requires: SSH to r630-01, r630-02, ml110 (key-based).
set -euo pipefail

# Locate the repository root relative to this script (two directories up).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

# Optional project environment and address overrides; both files may be absent,
# and a failure while sourcing them is deliberately non-fatal.
[[ -f "${PROJECT_ROOT}/scripts/lib/load-project-env.sh" ]] && source "${PROJECT_ROOT}/scripts/lib/load-project-env.sh" 2>/dev/null || true
[[ -f "${PROJECT_ROOT}/config/ip-addresses.conf" ]] && source "${PROJECT_ROOT}/config/ip-addresses.conf" 2>/dev/null || true

# Proxmox host addresses (overridable through PROXMOX_HOST_* variables).
# These must be assigned BEFORE the HYBX_HOST fallback below: under `set -u`,
# referencing an unassigned R630_01 aborts the script.
R630_01="${PROXMOX_HOST_R630_01:-192.168.11.11}"
R630_02="${PROXMOX_HOST_R630_02:-192.168.11.12}"
ML110="${PROXMOX_HOST_ML110:-192.168.11.10}"

# HYBX RPC 2503,2504,2505 are on ML110 per get_host_for_vmid.
# Fall back to r630-01 when that helper is unavailable or fails.
if type get_host_for_vmid &>/dev/null; then
  HYBX_HOST="$(get_host_for_vmid 2503)" || HYBX_HOST="$R630_01"
else
  HYBX_HOST="$R630_01"
fi

# Command-line flags; unrecognised arguments are ignored, as before.
DRY_RUN=false
DIAGNOSE_ONLY=false
for a in "$@"; do
  case "$a" in
    --dry-run) DRY_RUN=true ;;
    --diagnose-only) DIAGNOSE_ONLY=true ;;
  esac
done
# Backend table. One record per public domain, fields separated by '|':
#   domain | backend_ip | backend_port | vmid | host | description
# Records are appended group by group; the order is the order they are checked.
BACKENDS=()

# DBIS frontend and API (r630-01)
BACKENDS+=(
  "dbis-admin.d-bis.org|192.168.11.130|80|10130|$R630_01|dbis-frontend nginx"
  "secure.d-bis.org|192.168.11.130|80|10130|$R630_01|dbis-frontend nginx"
  "dbis-api.d-bis.org|192.168.11.155|3000|10150|$R630_01|dbis-api node"
  "dbis-api-2.d-bis.org|192.168.11.156|3000|10151|$R630_01|dbis-api node"
)

# rpc-http-prv (r630-01)
BACKENDS+=(
  "rpc-http-prv.d-bis.org|192.168.11.211|8545|2101|$R630_01|besu RPC"
)

# MIM4U www (r630-02)
BACKENDS+=(
  "www.mim4u.org|192.168.11.37|80|7810|$R630_02|mim-web nginx"
)

# Alltra RPC (r630-01)
BACKENDS+=(
  "rpc-alltra.d-bis.org|192.168.11.172|8545|2500|$R630_01|besu alltra"
  "rpc-alltra-2.d-bis.org|192.168.11.173|8545|2501|$R630_01|besu alltra"
  "rpc-alltra-3.d-bis.org|192.168.11.174|8545|2502|$R630_01|besu alltra"
)

# HYBX RPC (host taken from HYBX_HOST, falling back to r630-01 when empty/unset)
BACKENDS+=(
  "rpc-hybx.d-bis.org|192.168.11.246|8545|2503|${HYBX_HOST:-$R630_01}|besu hybx"
  "rpc-hybx-2.d-bis.org|192.168.11.247|8545|2504|${HYBX_HOST:-$R630_01}|besu hybx"
  "rpc-hybx-3.d-bis.org|192.168.11.248|8545|2505|${HYBX_HOST:-$R630_01}|besu hybx"
)

# Cacti for Alltra/HYBX - IPs .177 and .251 (r630-02)
BACKENDS+=(
  "cacti-alltra.d-bis.org|192.168.11.177|80|5201|$R630_02|cacti web"
  "cacti-hybx.d-bis.org|192.168.11.251|80|5202|$R630_02|cacti web"
)
#######################################
# Run a command on a Proxmox host as root over SSH.
# Arguments: $1 - host name or IP address
#            $2 - command string, executed by the remote shell
# Outputs:   whatever the remote command writes to stdout/stderr
# Returns:   ssh's exit status
#######################################
run_ssh() {
  local target=$1
  local remote_cmd=$2
  # BatchMode=yes: this script requires key-based SSH, so fail fast instead of
  # blocking on an interactive password/passphrase prompt during unattended runs.
  # NOTE(review): StrictHostKeyChecking=no is kept from the original; it skips
  # host-key verification - confirm this is acceptable for these hosts.
  ssh -o BatchMode=yes -o ConnectTimeout=10 -o StrictHostKeyChecking=no "root@${target}" "$remote_cmd"
}
# Print an informational message to stdout with a blue [INFO] tag.
# $1 - message text (backslash escapes in it are expanded).
log_info() {
  printf '\033[0;34m[INFO]\033[0m %b\n' "$1"
}
# Print a success message to stdout with a green check-mark tag.
# $1 - message text (backslash escapes in it are expanded).
log_ok() {
  printf '\033[0;32m[✓]\033[0m %b\n' "$1"
}
# Print a warning message to stdout with a yellow warning-sign tag.
# $1 - message text (backslash escapes in it are expanded).
log_warn() {
  printf '\033[0;33m[⚠]\033[0m %b\n' "$1"
}
# Print an error message to stdout with a red cross tag.
# $1 - message text (backslash escapes in it are expanded).
log_err() {
  printf '\033[0;31m[✗]\033[0m %b\n' "$1"
}
#######################################
# Probe http://IP:PORT/ with curl, executed on a Proxmox host.
# Arguments: $1 - Proxmox host, $2 - backend IP, $3 - backend port,
#            $4 - curl connect timeout in seconds
# Outputs:   at most 3 characters of HTTP status on stdout ("000" = no answer)
#######################################
probe_http_code() {
  local host=$1 ip=$2 port=$3 timeout=$4
  local raw
  raw=$(run_ssh "$host" "curl -s -o /dev/null -w '%{http_code}' --connect-timeout ${timeout} http://${ip}:${port}/ 2>/dev/null" || echo "000")
  # On failure curl already prints 000 and the fallback appends another one;
  # stripping newlines and keeping the first three characters normalises both.
  printf '%s' "$raw" | tr -d '\r\n' | head -c 3
}

#######################################
# Print the listening TCP sockets inside a container, if any were reported.
# Arguments: $1 - Proxmox host, $2 - container VMID
#######################################
show_listeners() {
  local listening
  listening=$(run_ssh "$1" "pct exec $2 -- ss -tlnp 2>/dev/null | head -20" 2>/dev/null || true)
  if [[ -n "$listening" ]]; then
    echo " Listening in CT: $listening"
  fi
}

#######################################
# Diagnose one backend record and, unless --dry-run/--diagnose-only is set,
# try to start the container or the service behind it.
# Globals:   R630_01, R630_02, ML110, DRY_RUN, DIAGNOSE_ONLY (read)
# Arguments: $1 - record "domain|ip|port|vmid|host|description"
# Outputs:   progress messages on stdout, followed by one blank line
# Returns:   0 (problems are reported, not propagated)
#######################################
diagnose_backend() {
  local domain ip port vmid host desc
  IFS='|' read -r domain ip port vmid host desc <<< "$1"
  [[ -n "$domain" ]] || return 0
  log_info "--- $domain -> $ip:$port ($desc) ---"

  # No host recorded: look for a container whose first IP matches the backend.
  if [[ -z "$host" ]]; then
    log_warn " No host; will try to discover VMID by IP on r630-01/r630-02/ml110"
    local h v list cip
    for h in "$R630_01" "$R630_02" "$ML110"; do
      run_ssh "$h" "echo OK" &>/dev/null || continue
      list=$(run_ssh "$h" "pct list 2>/dev/null | awk 'NR>1{print \$1}'" 2>/dev/null || true)
      # Word-splitting is intended: $list is a whitespace-separated VMID list.
      for v in $list; do
        cip=$(run_ssh "$h" "pct exec $v -- hostname -I 2>/dev/null | awk '{print \$1}'" 2>/dev/null || true)
        if [[ "$cip" == "$ip" ]]; then
          vmid=$v
          host=$h
          log_info " Found VMID $vmid on $host"
          break 2
        fi
      done
    done
    if [[ -z "$host" ]]; then
      log_warn " Could not find container for $ip; skipping"
      echo ""
      return 0
    fi
  fi

  if ! run_ssh "$host" "echo OK" &>/dev/null; then
    log_warn " Cannot SSH to $host"
    echo ""
    return 0
  fi

  local status alt_status
  status=$(run_ssh "$host" "pct status $vmid 2>/dev/null | awk '{print \$2}'" 2>/dev/null || echo "missing")
  # If HYBX (2503-2505) is empty on ML110, try r630-01
  if [[ -z "$status" || "$status" == "missing" ]] && [[ "$vmid" == "2503" || "$vmid" == "2504" || "$vmid" == "2505" ]] && [[ "$host" == "$ML110" ]]; then
    alt_status=$(run_ssh "$R630_01" "pct status $vmid 2>/dev/null | awk '{print \$2}'" 2>/dev/null || echo "")
    if [[ "$alt_status" == "running" ]]; then
      host="$R630_01"
      status="running"
    fi
  fi

  # Container is not running: start it (when fixing is allowed) and move on;
  # the port check only makes sense on a later run.
  if [[ "$status" != "running" ]]; then
    log_warn " Container $vmid status: ${status:-empty} (host $host)"
    if [[ "$DRY_RUN" != true && "$DIAGNOSE_ONLY" != true ]]; then
      if run_ssh "$host" "pct start $vmid" 2>/dev/null; then
        log_ok " Started $vmid"
      else
        log_err " Failed to start $vmid"
      fi
    fi
    echo ""
    return 0
  fi

  # Check if port is listening (from host: curl to container IP)
  local code
  code=$(probe_http_code "$host" "$ip" "$port" 2)
  if [[ -n "$code" && "$code" != "000" ]]; then
    log_ok " $ip:$port responds (HTTP $code)"
    echo ""
    return 0
  fi

  if [[ "$port" == "8545" ]]; then
    # Try JSON-RPC for 8545
    local body
    body=$(run_ssh "$host" "curl -s -X POST -H 'Content-Type: application/json' -d '{\"jsonrpc\":\"2.0\",\"method\":\"eth_chainId\",\"params\":[],\"id\":1}' --connect-timeout 2 http://${ip}:${port}/ 2>/dev/null" || true)
    # Here-string instead of `echo | grep -q`: no SIGPIPE failure under pipefail.
    if grep -q "result" <<< "$body"; then
      log_ok " Port $port responds (JSON-RPC)"
    else
      log_warn " Port $port not responding from $host"
      show_listeners "$host" "$vmid"
      if [[ "$DIAGNOSE_ONLY" != true && "$DRY_RUN" != true ]]; then
        if run_ssh "$host" "pct exec $vmid -- systemctl start besu 2>/dev/null"; then
          log_ok " Started besu in $vmid"
        else
          log_warn " Could not start besu in $vmid"
        fi
        echo " (Besu may take 30-60s to bind; re-run script to verify)"
      fi
    fi
  else
    log_warn " Port $port not responding (curl got $code)"
    # Show what is listening inside the CT
    show_listeners "$host" "$vmid"
    if [[ "$DIAGNOSE_ONLY" != true && "$DRY_RUN" != true ]]; then
      # Best effort: the container may run any of these services, so failures
      # of individual start attempts are expected and ignored.
      run_ssh "$host" "pct exec $vmid -- systemctl start nginx 2>/dev/null" || true
      run_ssh "$host" "pct exec $vmid -- systemctl start apache2 2>/dev/null" || true
      run_ssh "$host" "pct exec $vmid -- systemctl start dbis-api 2>/dev/null" || run_ssh "$host" "pct exec $vmid -- systemctl start node 2>/dev/null" || true
      sleep 2
      local code2
      code2=$(probe_http_code "$host" "$ip" "$port" 3)
      if [[ -n "$code2" && "$code2" != "000" ]]; then
        log_ok " After start: $ip:$port responds (HTTP $code2)"
      else
        log_warn " After start: $ip:$port still not responding"
      fi
    fi
  fi
  echo ""
}

echo ""
echo "=== 502 deep dive: diagnose and fix each backend ==="
echo " dry-run=$DRY_RUN diagnose-only=$DIAGNOSE_ONLY"
echo ""
for line in "${BACKENDS[@]}"; do
  diagnose_backend "$line"
done
log_ok "Done. Re-run E2E: ./scripts/verify/verify-end-to-end-routing.sh"
echo ""