Sankofa/infrastructure/proxmox/scripts/cluster-health.sh

#!/bin/bash
set -o errexit -o nounset -o pipefail

# Proxmox cluster health check.
#
# Optional environment variables (each defaults to the empty string):
#   NODE - when non-empty, only this node is checked.
#   SITE - when non-empty (and NODE is empty), its value is logged before
#          the cluster-wide checks run.

: "${SITE:=}"
: "${NODE:=}"

# Write one timestamped diagnostic line to stderr.
# Arguments: the message words; they are joined with the first character
#            of IFS (a space by default) and printed verbatim.
log() {
    printf '[%s] %s\n' "$(date +'%Y-%m-%d %H:%M:%S')" "$*" >&2
}

# Report a fatal problem through log() and terminate with exit status 1.
# Arguments: the words describing the failure.
error() {
    local message="ERROR: $*"
    log "${message}"
    exit 1
}

# Print the first numeric value stored under JSON key $1 in the text $2.
# Prints nothing when the key is absent and always returns 0, so callers
# running under errexit/pipefail are not aborted by a missing key.
_json_number() {
    local key=$1 json=$2
    grep -o "\"${key}\"[[:space:]]*:[[:space:]]*[0-9.eE+-]*" <<< "${json}" \
        | head -n 1 \
        | sed 's/^[^:]*:[[:space:]]*//' || true
}

#######################################
# Report the health of a single Proxmox node.
# Arguments:
#   $1 - node name as used in the /nodes/<name>/status API path
# Outputs:
#   Status lines through log() (uptime, CPU load, memory usage).
# Returns:
#   0 when the node reports an uptime, 1 when it is unreachable or its
#   status cannot be parsed. Exits through error() if pvesh is missing.
#######################################
check_node() {
    local node=$1
    local status flat uptime cpu cpu_percent memory mem_total mem_used mem_percent

    log "Checking node: ${node}..."

    if ! command -v pvesh &> /dev/null; then
        error "pvesh not found. This script must be run on a Proxmox node."
    fi

    # A failing pvesh call is treated exactly like an empty reply.
    status=$(pvesh get "/nodes/${node}/status" --output-format json 2>/dev/null || echo "{}")

    if [ -z "${status}" ] || [ "${status}" = "{}" ]; then
        log "  ❌ Node ${node} is unreachable"
        return 1
    fi

    # grep works line by line; collapse newlines so pretty-printed JSON
    # is parsed the same way as the compact form.
    flat=${status//$'\n'/ }

    uptime=$(_json_number uptime "${flat}")
    cpu=$(_json_number cpu "${flat}")

    # The status reply nests memory figures as "memory":{"total":..,"used":..}.
    # Flat memory_total/memory_used keys are still accepted as a fallback.
    memory=$(grep -o '"memory"[[:space:]]*:[[:space:]]*[{][^}]*[}]' <<< "${flat}" | head -n 1 || true)
    if [ -n "${memory}" ]; then
        mem_total=$(_json_number total "${memory}")
        mem_used=$(_json_number used "${memory}")
    else
        mem_total=$(_json_number memory_total "${flat}")
        mem_used=$(_json_number memory_used "${flat}")
    fi

    if [ -z "${uptime}" ]; then
        log "  ❌ Node ${node} status unknown"
        return 1
    fi

    log "  ✅ Node ${node} is online"
    log "     Uptime: ${uptime} seconds"

    if [ -n "${cpu}" ]; then
        # The API reports CPU load as a 0..1 fraction; scale it so the
        # "%" suffix is truthful.
        cpu_percent=$(LC_ALL=C awk -v frac="${cpu}" 'BEGIN { printf "%.1f", frac * 100 }' || true)
        log "     CPU: ${cpu_percent}%"
    fi

    # Only do integer arithmetic on plain integers, and never divide by zero.
    if [[ "${mem_total}" =~ ^[0-9]+$ && "${mem_used}" =~ ^[0-9]+$ ]] && (( mem_total > 0 )); then
        mem_percent=$(( mem_used * 100 / mem_total ))
        log "     Memory: ${mem_percent}% used (${mem_used}/${mem_total} bytes)"
    fi

    return 0
}

#######################################
# Check every node that pvesh lists for the cluster.
# Outputs:
#   Progress and the overall verdict through log().
# Returns:
#   0 when check_node succeeds for every node, 1 when at least one fails.
#   Exits through error() when no node names can be extracted.
#######################################
check_cluster() {
    local listing node all_healthy=true
    local -a nodes=()

    log "Checking cluster status..."

    # "|| true": a failed or empty listing is reported below instead of
    # tripping errexit/pipefail on the assignment.
    listing=$(pvesh get /nodes --output-format json 2>/dev/null \
        | grep -o '"node":"[^"]*' \
        | cut -d'"' -f4 || true)

    if [ -z "${listing}" ]; then
        error "Cannot retrieve cluster nodes"
    fi

    # One array element per node name; avoids word-splitting and globbing
    # of an unquoted expansion.
    mapfile -t nodes <<< "${listing}"

    # Join the names with spaces so the message stays on one log line.
    log "Found nodes: ${listing//$'\n'/ }"

    # Keep going after a failure so every node gets reported.
    for node in "${nodes[@]}"; do
        if ! check_node "${node}"; then
            all_healthy=false
        fi
    done

    if [ "${all_healthy}" = "true" ]; then
        log "✅ All nodes are healthy"
        return 0
    fi

    log "❌ Some nodes are unhealthy"
    return 1
}

# Report whether pvesh returns any storage definitions.
# The reply is only tested for emptiness (a failed pvesh call counts as
# empty); individual pools are not inspected.
# Globals: STORAGE (written) - raw reply from pvesh, or "[]" on failure.
# Returns: always 0.
check_storage() {
    log "Checking storage pools..."

    STORAGE=$(pvesh get /storage --output-format json 2>/dev/null || echo "[]")

    case "${STORAGE}" in
        "" | "[]")
            log "  ⚠️  No storage pools found"
            ;;
        *)
            # Parse storage (simplified)
            log "  Storage pools configured"
            ;;
    esac

    return 0
}

#######################################
# Count the guests known to the cluster and log the total.
# Outputs:
#   The number of distinct VM IDs (or a "none found" note) through log().
# Returns:
#   Always 0; a failed pvesh call is reported as "No VMs found".
#######################################
check_vms() {
    local vmids
    local -a ids=()

    log "Checking virtual machines..."

    # /nodes only lists cluster members and carries no "vmid" fields;
    # guests are reported by /cluster/resources filtered to type "vm".
    # "|| true" keeps an empty result from tripping errexit/pipefail.
    vmids=$(pvesh get /cluster/resources --type vm --output-format json 2>/dev/null \
        | grep -o '"vmid":[0-9]*' \
        | cut -d':' -f2 \
        | sort -u || true)

    if [ -z "${vmids}" ]; then
        log "  No VMs found"
        return 0
    fi

    # One element per distinct VM ID.
    mapfile -t ids <<< "${vmids}"
    log "  Found ${#ids[@]} virtual machines"

    return 0
}

#######################################
# Run the health checks selected by the NODE / SITE variables.
# Globals:
#   NODE (read) - when non-empty, only that node is checked.
#   SITE (read) - when non-empty (and NODE is empty), logged before the
#                 cluster-wide checks.
# Outputs:
#   Progress and a final summary through log().
# Returns:
#   0 when every executed check succeeded, 1 otherwise.
#######################################
main() {
    local failures=0

    log "Starting Proxmox cluster health check..."

    if [ -n "${NODE}" ]; then
        check_node "${NODE}" || failures=$((failures + 1))
    else
        if [ -n "${SITE}" ]; then
            log "Checking site: ${SITE}"
        fi

        # Collect results explicitly rather than relying on errexit, so one
        # failing check does not hide the outcome of the remaining ones.
        check_cluster || failures=$((failures + 1))
        check_storage || failures=$((failures + 1))
        check_vms || failures=$((failures + 1))
    fi

    if [ "${failures}" -gt 0 ]; then
        log "Health check completed with ${failures} failed check(s)"
        return 1
    fi

    log "Health check completed!"
}

# Script entry point: pass the command-line arguments through to main.
main "$@"