Some checks failed
Deploy to Phoenix / deploy (push) Has been cancelled
Co-authored-by: Cursor <cursoragent@cursor.com>
146 lines
7.0 KiB
Bash
Executable File
146 lines
7.0 KiB
Bash
Executable File
#!/usr/bin/env bash
# Deep dive: diagnose and fix every 502 from E2E routing.
# For each known backend (domain → IP:port), SSH to Proxmox, check container + port, fix.
#
# Usage: ./scripts/maintenance/diagnose-and-fix-502s-via-ssh.sh [--dry-run] [--diagnose-only]
# Requires: SSH to r630-01, r630-02, ml110 (key-based).
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

# Optional project configuration. Both files are best-effort: a missing file or
# an error while sourcing must not abort the diagnosis.
if [[ -f "${PROJECT_ROOT}/scripts/lib/load-project-env.sh" ]]; then
  # shellcheck source=/dev/null
  source "${PROJECT_ROOT}/scripts/lib/load-project-env.sh" 2>/dev/null || true
fi
if [[ -f "${PROJECT_ROOT}/config/ip-addresses.conf" ]]; then
  # shellcheck source=/dev/null
  source "${PROJECT_ROOT}/config/ip-addresses.conf" 2>/dev/null || true
fi

# Command-line flags. Arguments other than the two known flags are ignored.
DRY_RUN=false
DIAGNOSE_ONLY=false
for a in "$@"; do
  case "$a" in
    --dry-run) DRY_RUN=true ;;
    --diagnose-only) DIAGNOSE_ONLY=true ;;
  esac
done

# Proxmox hosts; each can be overridden through the environment.
R630_01="${PROXMOX_HOST_R630_01:-192.168.11.11}"
R630_02="${PROXMOX_HOST_R630_02:-192.168.11.12}"
ML110="${PROXMOX_HOST_ML110:-192.168.11.10}"

# Host for the HYBX RPC containers (2503, 2504, 2505): ask get_host_for_vmid
# when a sourced file provides it, otherwise fall back to r630-01.
# This must run after R630_01 is assigned: under `set -u`, expanding it any
# earlier aborts the script with "unbound variable".
HYBX_HOST=""
if type get_host_for_vmid &>/dev/null; then
  HYBX_HOST="$(get_host_for_vmid 2503)" || HYBX_HOST=""
fi
HYBX_HOST="${HYBX_HOST:-$R630_01}"
|
||
|
||
# Backend table. One entry per public domain, fields separated by '|':
#   domain | backend_ip | backend_port | vmid | host | description
BACKENDS=()

# DBIS frontend and API containers (host: r630-01)
BACKENDS+=(
  "dbis-admin.d-bis.org|192.168.11.130|80|10130|$R630_01|dbis-frontend nginx"
  "secure.d-bis.org|192.168.11.130|80|10130|$R630_01|dbis-frontend nginx"
  "dbis-api.d-bis.org|192.168.11.155|3000|10150|$R630_01|dbis-api node"
  "dbis-api-2.d-bis.org|192.168.11.156|3000|10151|$R630_01|dbis-api node"
)

# rpc-http-prv (host: r630-01)
BACKENDS+=(
  "rpc-http-prv.d-bis.org|192.168.11.211|8545|2101|$R630_01|besu RPC"
)

# MIM4U www (host: r630-02)
BACKENDS+=(
  "www.mim4u.org|192.168.11.37|80|7810|$R630_02|mim-web nginx"
)

# Alltra RPC (host: r630-01)
BACKENDS+=(
  "rpc-alltra.d-bis.org|192.168.11.172|8545|2500|$R630_01|besu alltra"
  "rpc-alltra-2.d-bis.org|192.168.11.173|8545|2501|$R630_01|besu alltra"
  "rpc-alltra-3.d-bis.org|192.168.11.174|8545|2502|$R630_01|besu alltra"
)

# HYBX RPC (host: HYBX_HOST when set and non-empty, otherwise r630-01)
BACKENDS+=(
  "rpc-hybx.d-bis.org|192.168.11.246|8545|2503|${HYBX_HOST:-$R630_01}|besu hybx"
  "rpc-hybx-2.d-bis.org|192.168.11.247|8545|2504|${HYBX_HOST:-$R630_01}|besu hybx"
  "rpc-hybx-3.d-bis.org|192.168.11.248|8545|2505|${HYBX_HOST:-$R630_01}|besu hybx"
)

# Cacti for alltra/hybx - IPs .177 and .251 (host: r630-02)
# NOTE(review): an earlier comment marked these VMIDs as TBD; confirm 5201/5202.
BACKENDS+=(
  "cacti-alltra.d-bis.org|192.168.11.177|80|5201|$R630_02|cacti web"
  "cacti-hybx.d-bis.org|192.168.11.251|80|5202|$R630_02|cacti web"
)
|
||
|
||
#######################################
# Run a command as root on a remote host over SSH.
# Arguments: $1 - host name or IP address
#            $2 - command line, passed to the remote shell as one string
# Outputs:   whatever the remote command writes to stdout/stderr
# Returns:   the exit status reported by ssh
#######################################
run_ssh() {
  local target=$1
  local remote_cmd=$2
  # BatchMode=yes makes ssh fail fast instead of prompting for a password or
  # passphrase on the terminal, which would hang this non-interactive script.
  # NOTE(review): StrictHostKeyChecking=no disables host-key verification;
  # kept as-is for compatibility, but consider pinning known_hosts.
  ssh -o BatchMode=yes -o ConnectTimeout=10 -o StrictHostKeyChecking=no \
    "root@${target}" "$remote_cmd"
}
|
||
# Print an informational message on stdout, prefixed with a blue "[INFO]" tag.
# Arguments: $1 - message text
log_info() {
  local blue='\033[0;34m'
  local reset='\033[0m'
  echo -e "${blue}[INFO]${reset} $1"
}
|
||
# Print a success message on stdout, prefixed with a green "[✓]" tag.
# Arguments: $1 - message text
log_ok() {
  local green='\033[0;32m'
  local reset='\033[0m'
  echo -e "${green}[✓]${reset} $1"
}
|
||
# Print a warning message on stdout, prefixed with a yellow "[⚠]" tag.
# Arguments: $1 - message text
log_warn() {
  local yellow='\033[0;33m'
  local reset='\033[0m'
  echo -e "${yellow}[⚠]${reset} $1"
}
|
||
# Print an error message on stdout, prefixed with a red "[✗]" tag.
# Arguments: $1 - message text
log_err() {
  local red='\033[0;31m'
  local reset='\033[0m'
  echo -e "${red}[✗]${reset} $1"
}
|
||
|
||
# Run banner: blank line, title, the effective mode flags, blank line.
printf '\n%s\n' "=== 502 deep dive: diagnose and fix each backend ==="
printf '%s\n\n' " dry-run=$DRY_RUN diagnose-only=$DIAGNOSE_ONLY"
|
||
|
||
#######################################
# Fetch the HTTP status code of http://<ip>:<port>/ as seen from a Proxmox host.
# Arguments: $1 - Proxmox host to run curl on
#            $2 - backend IP, $3 - backend port
#            $4 - curl connect timeout in seconds
# Outputs:   at most three characters on stdout; "000" when nothing answered
#######################################
_probe_http_code() {
  local raw
  raw=$(run_ssh "$1" "curl -s -o /dev/null -w '%{http_code}' --connect-timeout $4 http://$2:$3/ 2>/dev/null" || echo "000")
  # On failure curl's own "000" and the fallback "000" are both captured;
  # dropping CR/LF and keeping the first three characters normalises that.
  raw=${raw//[$'\r\n']/}
  printf '%s' "${raw:0:3}"
}

#######################################
# Print the listening TCP sockets inside a container (first 20 lines of ss).
# Prints nothing when the query fails or returns no output.
# Arguments: $1 - Proxmox host, $2 - container VMID
#######################################
_show_ct_listeners() {
  local listening
  listening=$(run_ssh "$1" "pct exec $2 -- ss -tlnp 2>/dev/null | head -20" 2>/dev/null || true)
  if [[ -n "$listening" ]]; then
    echo " Listening in CT: $listening"
  fi
}

#######################################
# Check one backend and, unless DRY_RUN or DIAGNOSE_ONLY is "true", try to
# bring it back: start a stopped container, or start the usual services when
# the container runs but the port does not answer.
# Globals:   R630_01, R630_02, ML110, DRY_RUN, DIAGNOSE_ONLY (read)
# Arguments: $1 - backend IP, $2 - backend port, $3 - VMID,
#            $4 - Proxmox host (empty: discover host and VMID by container IP)
# Returns:   always 0, so a broken backend never aborts the run under `set -e`
#######################################
_check_backend() {
  local ip=$1 port=$2 vmid=$3 host=$4
  local h="" v="" list="" cip="" status="" alt_status="" code="" code2="" body=""
  local fix=false
  if [[ "$DRY_RUN" != true && "$DIAGNOSE_ONLY" != true ]]; then
    fix=true
  fi

  if [[ -z "$host" ]]; then
    log_warn " No host; will try to discover VMID by IP on r630-01/r630-02/ml110"
    for h in "$R630_01" "$R630_02" "$ML110"; do
      run_ssh "$h" "echo OK" &>/dev/null || continue
      list=$(run_ssh "$h" "pct list 2>/dev/null | awk 'NR>1{print \$1}'" 2>/dev/null || true)
      # Word splitting on $list is intended: one VMID per word.
      for v in $list; do
        cip=$(run_ssh "$h" "pct exec $v -- hostname -I 2>/dev/null | awk '{print \$1}'" 2>/dev/null || true)
        if [[ "$cip" == "$ip" ]]; then
          vmid=$v
          host=$h
          log_info " Found VMID $vmid on $host"
          break 2
        fi
      done
    done
    if [[ -z "$host" ]]; then
      log_warn " Could not find container for $ip; skipping"
      return 0
    fi
  fi

  if ! run_ssh "$host" "echo OK" &>/dev/null; then
    log_warn " Cannot SSH to $host"
    return 0
  fi

  status=$(run_ssh "$host" "pct status $vmid 2>/dev/null | awk '{print \$2}'" 2>/dev/null || echo "missing")

  # If HYBX (2503–2505) empty on ML110, try r630-01
  if [[ -z "$status" || "$status" == "missing" ]] \
    && [[ "$vmid" == "2503" || "$vmid" == "2504" || "$vmid" == "2505" ]] \
    && [[ "$host" == "$ML110" ]]; then
    alt_status=$(run_ssh "$R630_01" "pct status $vmid 2>/dev/null | awk '{print \$2}'" 2>/dev/null || echo "")
    if [[ "$alt_status" == "running" ]]; then
      host="$R630_01"
      status="running"
    fi
  fi

  if [[ "$status" != "running" ]]; then
    log_warn " Container $vmid status: ${status:-empty} (host $host)"
    if [[ "$fix" == true ]]; then
      if run_ssh "$host" "pct start $vmid" 2>/dev/null; then
        log_ok " Started $vmid"
      else
        log_err " Failed to start $vmid"
      fi
    fi
    return 0
  fi

  # Check if port is listening (from host: curl to container IP)
  code=$(_probe_http_code "$host" "$ip" "$port" 2)
  if [[ -n "$code" && "$code" != "000" ]]; then
    log_ok " $ip:$port responds (HTTP $code)"
    return 0
  fi

  if [[ "$port" == "8545" ]]; then
    # Try JSON-RPC for 8545
    body=$(run_ssh "$host" "curl -s -X POST -H 'Content-Type: application/json' -d '{\"jsonrpc\":\"2.0\",\"method\":\"eth_chainId\",\"params\":[],\"id\":1}' --connect-timeout 2 http://${ip}:${port}/ 2>/dev/null" || true)
    # Pattern match instead of `echo | grep -q`: under pipefail that pipeline
    # can fail spuriously when grep exits before echo has finished writing.
    if [[ "$body" == *result* ]]; then
      log_ok " Port $port responds (JSON-RPC)"
    else
      log_warn " Port $port not responding from $host"
      _show_ct_listeners "$host" "$vmid"
      if [[ "$fix" == true ]]; then
        if run_ssh "$host" "pct exec $vmid -- systemctl start besu 2>/dev/null"; then
          log_ok " Started besu in $vmid"
        fi
        echo " (Besu may take 30–60s to bind; re-run script to verify)"
      fi
    fi
    return 0
  fi

  log_warn " Port $port not responding (curl got $code)"
  # Show what is listening inside the CT
  _show_ct_listeners "$host" "$vmid"
  if [[ "$fix" == true ]]; then
    # Best effort: the services that exist differ per container, so every
    # start is allowed to fail.
    run_ssh "$host" "pct exec $vmid -- systemctl start nginx 2>/dev/null" || true
    run_ssh "$host" "pct exec $vmid -- systemctl start apache2 2>/dev/null" || true
    run_ssh "$host" "pct exec $vmid -- systemctl start dbis-api 2>/dev/null" \
      || run_ssh "$host" "pct exec $vmid -- systemctl start node 2>/dev/null" \
      || true
    sleep 2
    code2=$(_probe_http_code "$host" "$ip" "$port" 3)
    if [[ -n "$code2" && "$code2" != "000" ]]; then
      log_ok " After start: $ip:$port responds (HTTP $code2)"
    else
      # Previously silent: make an unsuccessful recovery visible.
      log_warn " After start: $ip:$port still not responding"
    fi
  fi
  return 0
}

#######################################
# Diagnose (and optionally fix) one BACKENDS entry.
# Arguments: $1 - "domain|ip|port|vmid|host|description"
# Outputs:   a header line, the findings, then a blank separator line;
#            nothing at all for an entry with an empty domain
#######################################
diagnose_backend() {
  local domain ip port vmid host desc
  IFS='|' read -r domain ip port vmid host desc <<< "$1"
  [[ -n "$domain" ]] || return 0
  log_info "--- $domain → $ip:$port ($desc) ---"
  _check_backend "$ip" "$port" "$vmid" "$host"
  echo ""
}

for line in "${BACKENDS[@]}"; do
  diagnose_backend "$line"
done
|
||
|
||
# Closing hint: point the operator at the end-to-end verification script,
# followed by a blank line.
log_ok "Done. Re-run E2E: ./scripts/verify/verify-end-to-end-routing.sh"
printf '\n'
|