Files
proxmox/scripts/check-validator-sentry-logs.sh

283 lines
11 KiB
Bash
Raw Normal View History

#!/usr/bin/env bash
# Check all Validator and Sentry node logs for errors.
#
# Usage: check-validator-sentry-logs.sh [LOG_LINES]
#   LOG_LINES  number of journal lines to fetch per node (default: 100)
#
# Environment:
#   PROXMOX_HOST  Proxmox host used for `pct` access (default: 192.168.11.10)
#   SSH_PASSWORD  root password used by the direct-SSH fallback
#
# Validators: VMIDs 1000-1004
# Sentries: VMIDs 1500-1503
set -euo pipefail

# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Proxmox host configuration
PROXMOX_HOST="${PROXMOX_HOST:-192.168.11.10}"
# NOTE(review): a real-looking credential is committed here as the default.
# It is kept so existing invocations keep working, but it should be rotated
# and supplied through the SSH_PASSWORD environment variable instead.
SSH_PASSWORD="${SSH_PASSWORD:-L@kers2010}"

# Node IP mappings (VMID -> container IP)
declare -A NODE_IPS=(
  [1000]="192.168.11.100"
  [1001]="192.168.11.101"
  [1002]="192.168.11.102"
  [1003]="192.168.11.103"
  [1004]="192.168.11.104"
  [1500]="192.168.11.150"
  [1501]="192.168.11.151"
  [1502]="192.168.11.152"
  [1503]="192.168.11.153"
)

# Node definitions
VALIDATORS=(1000 1001 1002 1003 1004)
SENTRIES=(1500 1501 1502 1503)

# Number of journal lines to inspect per node. The value is interpolated into
# commands executed on remote hosts, so anything but a plain non-negative
# integer is rejected up front.
LOG_LINES="${1:-100}"
if ! [[ "$LOG_LINES" =~ ^[0-9]+$ ]]; then
  echo "Invalid LOG_LINES '$LOG_LINES': expected a non-negative integer" >&2
  echo "Usage: ${0##*/} [LOG_LINES]" >&2
  exit 2
fi

# Check if sshpass is available (needed for the direct-SSH fallback)
if ! command -v sshpass >/dev/null 2>&1; then
  echo "⚠️ sshpass not installed. Attempting to install..."
  if ! { sudo apt-get update -qq && sudo apt-get install -y sshpass 2>/dev/null; }; then
    echo "❌ Cannot install sshpass automatically"
    echo "Please install manually: sudo apt-get install sshpass"
    exit 1
  fi
fi

# Error patterns to search for.
# check_node_logs greps these with -i, so the case variants below all match
# the same lines.
ERROR_PATTERNS=(
  "error"
  "Error"
  "ERROR"
  "failed"
  "Failed"
  "FAILED"
  "exception"
  "Exception"
  "EXCEPTION"
  "fatal"
  "Fatal"
  "FATAL"
  "panic"
  "Panic"
  "PANIC"
  "Unable to read"
  "file not found"
  "configuration"
  "restart"
  "crash"
  "timeout"
  "Timeout"
  "connection refused"
  "Connection refused"
)

echo -e "${BLUE}╔══════════════════════════════════════════════════════════════╗${NC}"
echo -e "${BLUE}║ CHECKING ALL VALIDATOR AND SENTRY NODE LOGS ║${NC}"
echo -e "${BLUE}╚══════════════════════════════════════════════════════════════╝${NC}"
echo ""
echo "Checking last $LOG_LINES lines of logs for each node"
echo ""
#######################################
# Inspect one node: report its container and service state, print the recent
# journal of its service, and scan that journal for error patterns.
# Globals (read):
#   NODE_IPS, PROXMOX_HOST, SSH_PASSWORD, LOG_LINES, ERROR_PATTERNS,
#   RED, GREEN, YELLOW, BLUE, NC
# Arguments:
#   $1 - container VMID (key into NODE_IPS)
#   $2 - systemd unit name without the ".service" suffix
#   $3 - node type label shown in the banner
# Outputs:
#   Human-readable report on stdout.
# Returns:
#   0 when logs were retrieved and nothing suspicious was found;
#   1 when the VMID is unknown, the container is not running, no logs were
#   returned, an error pattern matched, or the restart counter exceeds 10.
#######################################
check_node_logs() {
  local vmid=$1
  local service_name=$2
  local node_type=$3

  echo -e "${BLUE}═══════════════════════════════════════════════════════════════${NC}"
  echo -e "${BLUE}Checking ${node_type} VMID $vmid (service: $service_name)${NC}"
  echo -e "${BLUE}═══════════════════════════════════════════════════════════════${NC}"

  # ':-' keeps `set -u` from aborting the script on a VMID that is missing
  # from the map, so the message below is actually reachable.
  local container_ip="${NODE_IPS[$vmid]:-}"
  if [[ -z "$container_ip" ]]; then
    echo -e "${RED}❌ VMID $vmid: IP address not found in mapping${NC}"
    echo ""
    return 1
  fi

  local logs=""
  local service_status=""
  local status_output=""
  local -a ssh_opts=(-o StrictHostKeyChecking=accept-new)
  local -a via_host=(
    ssh "${ssh_opts[@]}" -o ConnectTimeout=5
    -i ~/.ssh/id_ed25519_proxmox "root@${PROXMOX_HOST}"
  )

  # Preferred path: go through the Proxmox host. This single call is both the
  # reachability probe and the container status query.
  if status_output=$(ssh "${ssh_opts[@]}" -o ConnectTimeout=3 \
    -i ~/.ssh/id_ed25519_proxmox "root@${PROXMOX_HOST}" \
    "pct status $vmid 2>/dev/null" 2>/dev/null); then
    if [[ -z "$status_output" ]]; then
      echo -e "${RED}❌ VMID $vmid: Container not found or not accessible${NC}"
      echo ""
      return 1
    fi

    # The container state is the second whitespace-separated field.
    local status
    status=$(awk '{print $2}' <<<"$status_output") || status="unknown"
    if [[ "$status" != "running" ]]; then
      echo -e "${YELLOW}⚠️ VMID $vmid: Container is not running (status: $status)${NC}"
      echo ""
      return 1
    fi

    # `systemctl is-active` prints the state and also exits non-zero for
    # anything but "active", so the exit status is deliberately ignored.
    service_status=$("${via_host[@]}" \
      "pct exec $vmid -- systemctl is-active $service_name.service 2>/dev/null") || true
    logs=$("${via_host[@]}" \
      "pct exec $vmid -- journalctl -u $service_name.service -n $LOG_LINES --no-pager 2>/dev/null") || true
  else
    # Fallback: direct SSH to the container with password authentication.
    echo -e "${YELLOW}⚠️ Cannot access via Proxmox host, trying direct SSH to container...${NC}"
    # sshpass -e takes the password from $SSHPASS, keeping it out of argv.
    local -a direct=(
      sshpass -e ssh "${ssh_opts[@]}" -o ConnectTimeout=5 "root@${container_ip}"
    )
    service_status=$(SSHPASS="$SSH_PASSWORD" "${direct[@]}" \
      "systemctl is-active $service_name.service 2>/dev/null") || true
    logs=$(SSHPASS="$SSH_PASSWORD" "${direct[@]}" \
      "journalctl -u $service_name.service -n $LOG_LINES --no-pager 2>/dev/null") || true
  fi

  # Keep only the first line of the answer; no answer at all means the unit
  # could not be queried and is reported as inactive.
  service_status=${service_status%%$'\n'*}
  service_status=${service_status:-inactive}

  if [[ "$service_status" != "active" ]]; then
    echo -e "${YELLOW}⚠️ Service $service_name is not active (status: $service_status)${NC}"
  else
    echo -e "${GREEN}✅ Service $service_name is active${NC}"
  fi

  echo ""
  echo "Recent logs (last $LOG_LINES lines):"
  echo "---"
  if [[ -z "$logs" ]]; then
    echo -e "${YELLOW}⚠️ No logs found for service $service_name${NC}"
    echo ""
    return 1
  fi
  printf '%s\n' "$logs"
  echo "---"
  echo ""

  echo "Checking for errors..."
  local error_found=false
  local error_count=0
  local pattern pattern_key matches match_count
  local -a match_lines=()
  local -A seen_patterns=()

  for pattern in "${ERROR_PATTERNS[@]}"; do
    # Matching is case-insensitive, so patterns differing only in case would
    # report and count the very same lines again; handle each one once.
    pattern_key=${pattern,,}
    if [[ -z "$pattern_key" || -n "${seen_patterns[$pattern_key]:-}" ]]; then
      continue
    fi
    seen_patterns[$pattern_key]=1

    # systemd restart bookkeeping and CORS rejections are known noise.
    matches=$(grep -i -- "$pattern" <<<"$logs" \
      | grep -v -e "restart counter" -e "Scheduled restart" -e "CORS Rejected" || true)
    if [[ -z "$matches" ]]; then
      continue
    fi

    mapfile -t match_lines <<<"$matches"
    match_count=${#match_lines[@]}
    error_count=$((error_count + match_count))

    if [[ "$error_found" == false ]]; then
      error_found=true
      echo -e "${RED}❌ ERRORS FOUND:${NC}"
    fi
    echo -e "${RED} Pattern '$pattern' found $match_count time(s):${NC}"
    # Show at most five sample lines per pattern.
    printf ' %s\n' "${match_lines[@]:0:5}"
    if [ "$match_count" -gt 5 ]; then
      echo -e "${YELLOW} ... and $((match_count - 5)) more occurrence(s)${NC}"
    fi
  done

  # Restart count: taken from the most recent "restart counter is at N" line.
  local restart_count=0
  local restart_line
  restart_line=$(grep -i "restart counter" <<<"$logs" | tail -1 || true)
  if [[ "$restart_line" =~ restart\ counter\ is\ at\ ([0-9]+) ]]; then
    restart_count=${BASH_REMATCH[1]}
  fi
  if [ "$restart_count" -gt 10 ]; then
    echo -e "${RED}⚠️ High restart count: $restart_count${NC}"
    error_found=true
  elif [ "$restart_count" -gt 0 ]; then
    echo -e "${YELLOW} Restart count: $restart_count${NC}"
  fi

  echo ""
  if [[ "$error_found" == false ]]; then
    echo -e "${GREEN}✅ No errors found in recent logs${NC}"
    return 0
  fi
  echo -e "${RED}❌ Total error occurrences: $error_count${NC}"
  return 1
}
# Counters feeding the final summary
total_validators=0
total_sentries=0
validators_with_errors=0
sentries_with_errors=0
validators_checked=0
sentries_checked=0

# Print the Total/Checked/Errors lines for one node group.
# Arguments: $1 - group label, $2 - total, $3 - checked, $4 - nodes with errors
print_group_summary() {
  local label=$1
  local total=$2
  local checked=$3
  local failing=$4

  echo "${label}:"
  echo " Total: $total"
  echo " Checked: $checked"
  if [ "$failing" -ne 0 ]; then
    echo -e " Errors: ${RED}❌ Found in $failing node(s)${NC}"
  else
    echo -e " Errors: ${GREEN}✅ None found${NC}"
  fi
  echo ""
}

# Validator nodes
echo -e "${BLUE}╔══════════════════════════════════════════════════════════════╗${NC}"
echo -e "${BLUE}║ VALIDATOR NODES (VMIDs 1000-1004) ║${NC}"
echo -e "${BLUE}╚══════════════════════════════════════════════════════════════╝${NC}"
echo ""
for vmid in "${VALIDATORS[@]}"; do
  # Any non-zero return from check_node_logs counts the node as having errors;
  # every node is counted as checked either way.
  check_node_logs "$vmid" "besu-validator" "Validator" \
    || validators_with_errors=$((validators_with_errors + 1))
  validators_checked=$((validators_checked + 1))
  total_validators=$((total_validators + 1))
done

# Sentry nodes
echo -e "${BLUE}╔══════════════════════════════════════════════════════════════╗${NC}"
echo -e "${BLUE}║ SENTRY NODES (VMIDs 1500-1503) ║${NC}"
echo -e "${BLUE}╚══════════════════════════════════════════════════════════════╝${NC}"
echo ""
for vmid in "${SENTRIES[@]}"; do
  check_node_logs "$vmid" "besu-sentry" "Sentry" \
    || sentries_with_errors=$((sentries_with_errors + 1))
  sentries_checked=$((sentries_checked + 1))
  total_sentries=$((total_sentries + 1))
done

# Final summary
echo -e "${BLUE}╔══════════════════════════════════════════════════════════════╗${NC}"
echo -e "${BLUE}║ SUMMARY ║${NC}"
echo -e "${BLUE}╚══════════════════════════════════════════════════════════════╝${NC}"
echo ""
print_group_summary "Validators" "$total_validators" "$validators_checked" "$validators_with_errors"
print_group_summary "Sentries" "$total_sentries" "$sentries_checked" "$sentries_with_errors"

# Exit status mirrors the overall result: 1 as soon as any node had errors.
if [ "$validators_with_errors" -ne 0 ] || [ "$sentries_with_errors" -ne 0 ]; then
  echo -e "${RED}❌ Errors found in some nodes. Review logs above.${NC}"
  exit 1
fi
echo -e "${GREEN}✅ All logs checked - No current errors found!${NC}"
exit 0