#!/usr/bin/env bash
# Check all Validator and Sentry node logs for errors.
#
# Validators: VMIDs 1000-1004
# Sentries:   VMIDs 1500-1503
#
# Usage: <this-script> [LOG_LINES]
#   LOG_LINES  Number of journal lines inspected per node (default: 100).
#
# Environment:
#   PROXMOX_HOST  Proxmox host used for `pct` access (default: 192.168.11.10).
#   SSH_PASSWORD  root password used by the direct-SSH fallback.
#
# Exit status:
#   0  every node was checked and no problem was reported
#   1  at least one node reported a problem (or sshpass could not be installed)
#   2  LOG_LINES is not a non-negative integer

set -euo pipefail

# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
readonly RED GREEN YELLOW BLUE NC

# Proxmox host configuration
PROXMOX_HOST="${PROXMOX_HOST:-192.168.11.10}"
# NOTE(review): a real password is hard-coded as the default below. It is kept
# only so existing invocations keep working; rotate it and require SSH_PASSWORD
# from the environment (or switch the fallback to key-based auth).
SSH_PASSWORD="${SSH_PASSWORD:-L@kers2010}"

# Node IP mappings
declare -A NODE_IPS=(
  [1000]="192.168.11.100"
  [1001]="192.168.11.101"
  [1002]="192.168.11.102"
  [1003]="192.168.11.103"
  [1004]="192.168.11.104"
  [1500]="192.168.11.150"
  [1501]="192.168.11.151"
  [1502]="192.168.11.152"
  [1503]="192.168.11.153"
)

# Node definitions
VALIDATORS=(1000 1001 1002 1003 1004)
SENTRIES=(1500 1501 1502 1503)
readonly VALIDATORS SENTRIES

LOG_LINES="${1:-100}"
# LOG_LINES is interpolated into a command line executed on the remote side,
# so refuse anything that is not a plain number.
if ! [[ "$LOG_LINES" =~ ^[0-9]+$ ]]; then
  echo -e "${RED}❌ LOG_LINES must be a non-negative integer (got: '$LOG_LINES')${NC}" >&2
  exit 2
fi

# Check if sshpass is available
if ! command -v sshpass >/dev/null 2>&1; then
  echo "⚠️ sshpass not installed. Attempting to install..."
  if ! { sudo apt-get update -qq && sudo apt-get install -y sshpass 2>/dev/null; }; then
    echo "❌ Cannot install sshpass automatically"
    echo "Please install manually: sudo apt-get install sshpass"
    exit 1
  fi
fi

# Error patterns to search for.
# Matching is case-insensitive (grep -i), so each pattern is listed once;
# listing "error"/"Error"/"ERROR" separately reported and counted every
# matching line three times.
ERROR_PATTERNS=(
  "error"
  "failed"
  "exception"
  "fatal"
  "panic"
  "Unable to read"
  "file not found"
  "configuration"
  "restart"
  "crash"
  "timeout"
  "connection refused"
)

echo -e "${BLUE}╔══════════════════════════════════════════════════════════════╗${NC}"
echo -e "${BLUE}║ CHECKING ALL VALIDATOR AND SENTRY NODE LOGS ║${NC}"
echo -e "${BLUE}╚══════════════════════════════════════════════════════════════╝${NC}"
echo ""
echo "Checking last $LOG_LINES lines of logs for each node"
echo ""

#######################################
# Run a command on the Proxmox host over key-based SSH.
# Globals:   PROXMOX_HOST (read)
# Arguments: $1 - SSH connect timeout in seconds
#            $2.. - remote command
# Outputs:   whatever the remote command writes
# Returns:   exit status of ssh / the remote command
#######################################
proxmox_ssh() {
  local timeout=$1
  shift
  ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout="$timeout" \
    -i ~/.ssh/id_ed25519_proxmox "root@${PROXMOX_HOST}" "$@"
}

#######################################
# Run a command directly on a container over password-based SSH.
# The password is handed to sshpass through the SSHPASS environment variable
# (sshpass -e) so that it does not appear in the process argument list.
# Globals:   SSH_PASSWORD (read)
# Arguments: $1 - container IP address
#            $2.. - remote command
# Outputs:   whatever the remote command writes
# Returns:   exit status of sshpass / ssh / the remote command
#######################################
container_ssh() {
  local ip=$1
  shift
  SSHPASS="$SSH_PASSWORD" sshpass -e \
    ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=5 \
    "root@${ip}" "$@"
}

#######################################
# Check the service state and recent journal of one node and report
# error-looking log lines.
# Globals:   NODE_IPS, ERROR_PATTERNS, LOG_LINES, color variables (read)
# Arguments: $1 - container VMID
#            $2 - systemd service name, without the ".service" suffix
#            $3 - node type label used in the output (e.g. "Validator")
# Outputs:   human-readable report on stdout
# Returns:   0 when logs were retrieved and nothing suspicious was found,
#            1 otherwise (unknown VMID, container not running, no logs,
#            matching log lines, or a restart counter above 10)
#######################################
check_node_logs() {
  local vmid=$1
  local service_name=$2
  local node_type=$3

  echo -e "${BLUE}═══════════════════════════════════════════════════════════════${NC}"
  echo -e "${BLUE}Checking ${node_type} VMID $vmid (service: $service_name)${NC}"
  echo -e "${BLUE}═══════════════════════════════════════════════════════════════${NC}"

  # Get container IP. The ":-" default keeps `set -u` from aborting the whole
  # script on an unmapped VMID, so the message below can actually be shown.
  local container_ip="${NODE_IPS[$vmid]:-}"
  if [[ -z "$container_ip" ]]; then
    echo -e "${RED}❌ VMID $vmid: IP address not found in mapping${NC}"
    echo ""
    return 1
  fi

  local logs=""
  local service_status="unknown"
  local status_output status

  # Preferred method: go through the Proxmox host. A single `pct status` call
  # both probes reachability and yields the container state.
  if status_output=$(proxmox_ssh 3 "pct status $vmid 2>/dev/null" 2>/dev/null); then
    if [[ -z "$status_output" ]]; then
      echo -e "${RED}❌ VMID $vmid: Container not found or not accessible${NC}"
      echo ""
      return 1
    fi

    # The state is taken from the second whitespace-separated field.
    status=$(awk '{print $2}' <<<"$status_output")
    status=${status:-unknown}
    if [[ "$status" != "running" ]]; then
      echo -e "${YELLOW}⚠️ VMID $vmid: Container is not running (status: $status)${NC}"
      echo ""
      return 1
    fi

    # Check service status. `systemctl is-active` prints the state AND exits
    # non-zero for anything but "active", so only fall back to "inactive"
    # when nothing was printed at all.
    service_status=$(proxmox_ssh 5 \
      "pct exec $vmid -- systemctl is-active $service_name.service 2>/dev/null" || true)
    service_status=${service_status:-inactive}

    # Get recent logs
    logs=$(proxmox_ssh 5 \
      "pct exec $vmid -- journalctl -u $service_name.service -n $LOG_LINES --no-pager 2>/dev/null" || true)
  else
    # Fallback: Try direct SSH to container
    echo -e "${YELLOW}⚠️ Cannot access via Proxmox host, trying direct SSH to container...${NC}"

    # Check service status via direct SSH (same defaulting rule as above)
    service_status=$(container_ssh "$container_ip" \
      "systemctl is-active $service_name.service 2>/dev/null" || true)
    service_status=${service_status:-inactive}

    # Get recent logs via direct SSH
    logs=$(container_ssh "$container_ip" \
      "journalctl -u $service_name.service -n $LOG_LINES --no-pager 2>/dev/null" || true)
  fi

  if [[ "$service_status" != "active" ]]; then
    echo -e "${YELLOW}⚠️ Service $service_name is not active (status: $service_status)${NC}"
  else
    echo -e "${GREEN}✅ Service $service_name is active${NC}"
  fi

  # Show recent logs
  echo ""
  echo "Recent logs (last $LOG_LINES lines):"
  echo "---"
  if [[ -z "$logs" ]]; then
    echo -e "${YELLOW}⚠️ No logs found for service $service_name${NC}"
    echo ""
    return 1
  fi

  printf '%s\n' "$logs"
  echo "---"
  echo ""

  # Check for errors
  echo "Checking for errors..."
  local error_found=false
  local error_count=0
  local pattern matches match_count
  for pattern in "${ERROR_PATTERNS[@]}"; do
    # Lines about the restart counter, scheduled restarts and CORS rejections
    # are excluded from the error report. `|| true` absorbs grep's exit
    # status 1 when there is no match.
    matches=$(grep -iF -- "$pattern" <<<"$logs" \
      | grep -vE 'restart counter|Scheduled restart|CORS Rejected' || true)
    if [[ -n "$matches" ]]; then
      match_count=$(grep -c '' <<<"$matches")
      error_count=$((error_count + match_count))
      if [[ "$error_found" == false ]]; then
        error_found=true
        echo -e "${RED}❌ ERRORS FOUND:${NC}"
      fi
      echo -e "${RED} Pattern '$pattern' found $match_count time(s):${NC}"
      # Print at most the first five matches, indented; a single sed reads
      # all input, so no writer is killed by SIGPIPE under pipefail.
      sed -n '1,5s/^/ /p' <<<"$matches"
      if ((match_count > 5)); then
        echo -e "${YELLOW} ... and $((match_count - 5)) more occurrence(s)${NC}"
      fi
    fi
  done

  # Check restart count: read the number from the last "restart counter" line.
  local restart_line
  local restart_count=0
  restart_line=$(grep -i "restart counter" <<<"$logs" | tail -n 1 || true)
  if [[ "$restart_line" =~ restart\ counter\ is\ at\ ([0-9]+) ]]; then
    restart_count=${BASH_REMATCH[1]}
  fi
  if [[ "$restart_count" != "0" ]]; then
    # 10# forces base 10 so a value with leading zeros is not read as octal.
    if ((10#$restart_count > 10)); then
      echo -e "${RED}⚠️ High restart count: $restart_count${NC}"
      error_found=true
    elif ((10#$restart_count > 0)); then
      echo -e "${YELLOW}ℹ️ Restart count: $restart_count${NC}"
    fi
  fi

  echo ""
  if [[ "$error_found" == false ]]; then
    echo -e "${GREEN}✅ No errors found in recent logs${NC}"
    return 0
  fi
  echo -e "${RED}❌ Total error occurrences: $error_count${NC}"
  return 1
}

# Summary tracking
total_validators=0
total_sentries=0
validators_with_errors=0
sentries_with_errors=0
validators_checked=0
sentries_checked=0

# Check all Validator nodes
echo -e "${BLUE}╔══════════════════════════════════════════════════════════════╗${NC}"
echo -e "${BLUE}║ VALIDATOR NODES (VMIDs 1000-1004) ║${NC}"
echo -e "${BLUE}╚══════════════════════════════════════════════════════════════╝${NC}"
echo ""
for vmid in "${VALIDATORS[@]}"; do
  # A non-zero return means the node had a problem; it still counts as checked.
  if ! check_node_logs "$vmid" "besu-validator" "Validator"; then
    validators_with_errors=$((validators_with_errors + 1))
  fi
  validators_checked=$((validators_checked + 1))
  total_validators=$((total_validators + 1))
done

# Check all Sentry nodes
echo -e "${BLUE}╔══════════════════════════════════════════════════════════════╗${NC}"
echo -e "${BLUE}║ SENTRY NODES (VMIDs 1500-1503) ║${NC}"
echo -e "${BLUE}╚══════════════════════════════════════════════════════════════╝${NC}"
echo ""
for vmid in "${SENTRIES[@]}"; do
  if ! check_node_logs "$vmid" "besu-sentry" "Sentry"; then
    sentries_with_errors=$((sentries_with_errors + 1))
  fi
  sentries_checked=$((sentries_checked + 1))
  total_sentries=$((total_sentries + 1))
done

# Final Summary
echo -e "${BLUE}╔══════════════════════════════════════════════════════════════╗${NC}"
echo -e "${BLUE}║ SUMMARY ║${NC}"
echo -e "${BLUE}╚══════════════════════════════════════════════════════════════╝${NC}"
echo ""
echo "Validators:"
echo " Total: $total_validators"
echo " Checked: $validators_checked"
if ((validators_with_errors == 0)); then
  echo -e " Errors: ${GREEN}✅ None found${NC}"
else
  echo -e " Errors: ${RED}❌ Found in $validators_with_errors node(s)${NC}"
fi
echo ""
echo "Sentries:"
echo " Total: $total_sentries"
echo " Checked: $sentries_checked"
if ((sentries_with_errors == 0)); then
  echo -e " Errors: ${GREEN}✅ None found${NC}"
else
  echo -e " Errors: ${RED}❌ Found in $sentries_with_errors node(s)${NC}"
fi
echo ""

if ((validators_with_errors == 0 && sentries_with_errors == 0)); then
  echo -e "${GREEN}✅ All logs checked - No current errors found!${NC}"
  exit 0
else
  echo -e "${RED}❌ Errors found in some nodes. Review logs above.${NC}"
  exit 1
fi