Files
Sankofa/infrastructure/proxmox/scripts/cluster-health.sh
defiQUG 9daf1fd378 Apply Composer changes: comprehensive API updates, migrations, middleware, and infrastructure improvements
- Add comprehensive database migrations (001-024) for schema evolution
- Enhance API schema with expanded type definitions and resolvers
- Add new middleware: audit logging, rate limiting, MFA enforcement, security, tenant auth
- Implement new services: AI optimization, billing, blockchain, compliance, marketplace
- Add adapter layer for cloud integrations (Cloudflare, Kubernetes, Proxmox, storage)
- Update Crossplane provider with enhanced VM management capabilities
- Add comprehensive test suite for API endpoints and services
- Update frontend components with improved GraphQL subscriptions and real-time updates
- Enhance security configurations and headers (CSP, CORS, etc.)
- Update documentation and configuration files
- Add new CI/CD workflows and validation scripts
- Implement design system improvements and UI enhancements
2025-12-12 18:01:35 -08:00

136 lines
3.2 KiB
Bash
Executable File

#!/bin/bash
# Strict mode: abort on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -euo pipefail
# Proxmox Cluster Health Check Script
#
# Optional environment variables:
#   NODE - when non-empty, only this node is checked.
#   SITE - when non-empty (and NODE is empty), the site name is logged before
#          the cluster, storage and VM checks run; it does not change which
#          checks are executed.
# Both default to the empty string so they can be read safely under `set -u`.
SITE="${SITE:-}"
NODE="${NODE:-}"
#######################################
# Writes one timestamped line to stderr.
# Arguments: all arguments are joined into a single message.
# Outputs:   "[YYYY-MM-DD HH:MM:SS] <message>" on stderr; nothing on stdout.
#######################################
log() {
  printf '[%s] %s\n' "$(date +'%Y-%m-%d %H:%M:%S')" "$*" >&2
}
#######################################
# Logs a fatal message and terminates the script.
# Arguments: all arguments are joined into the message text.
# Outputs:   the message, prefixed with "ERROR: ", through log.
# Returns:   never returns; exits the shell with status 1.
#######################################
error() {
  local message="ERROR: $*"
  log "${message}"
  exit 1
}
#######################################
# Extracts the first value stored under a JSON key using a regex match.
# Arguments: $1 - JSON text
#            $2 - key name
#            $3 - optional ERE describing the value (default: [0-9]+)
# Outputs:   the matched value on stdout, or nothing when the key is absent.
# Returns:   always 0, so a missing key never aborts the caller under `set -e`.
#######################################
_pve_json_number() {
  local json=$1 key=$2 value_re=${3:-}
  [[ -n "${value_re}" ]] || value_re='[0-9]+'
  local re="\"${key}\"[[:space:]]*:[[:space:]]*(${value_re})"
  if [[ "${json}" =~ ${re} ]]; then
    printf '%s\n' "${BASH_REMATCH[1]}"
  fi
  return 0
}

#######################################
# Queries one node's status through pvesh and logs uptime, CPU and memory.
# Arguments: $1 - node name
# Outputs:   progress and results through log.
# Returns:   0 when an uptime value was found in the status payload,
#            1 when the query failed, returned nothing, or had no uptime.
#            Exits (via error) when pvesh is not installed.
#######################################
check_node() {
  local node=$1
  local status uptime cpu memory_total memory_used memory_percent memory_obj
  local memory_re='"memory"[[:space:]]*:[[:space:]]*\{([^}]*)\}'

  log "Checking node: ${node}..."
  if ! command -v pvesh &> /dev/null; then
    error "pvesh not found. This script must be run on a Proxmox node."
  fi

  # Check node status; a failed call is folded into the empty-object case.
  status=$(pvesh get "/nodes/${node}/status" --output-format json 2>/dev/null || echo "{}")
  if [[ -z "${status}" || "${status}" == "{}" ]]; then
    log " ❌ Node ${node} is unreachable"
    return 1
  fi

  # Parse status. The helper returns 0 even when a key is missing, so an
  # absent field yields an empty string instead of killing the script under
  # `set -e` + `pipefail` (which a failing `grep -o` pipeline would do).
  uptime=$(_pve_json_number "${status}" uptime)
  cpu=$(_pve_json_number "${status}" cpu '[0-9.]+')
  memory_total=$(_pve_json_number "${status}" memory_total)
  memory_used=$(_pve_json_number "${status}" memory_used)

  # NOTE(review): the Proxmox status payload appears to nest memory figures as
  # "memory":{"total":…,"used":…} rather than flat memory_total/memory_used
  # keys - confirm against the API. Fall back to that shape when the flat keys
  # are absent.
  if [[ -z "${memory_total}" || -z "${memory_used}" ]] \
    && [[ "${status}" =~ ${memory_re} ]]; then
    memory_obj=${BASH_REMATCH[1]}
    memory_total=$(_pve_json_number "${memory_obj}" total)
    memory_used=$(_pve_json_number "${memory_obj}" used)
  fi

  if [[ -z "${uptime}" ]]; then
    log " ❌ Node ${node} status unknown"
    return 1
  fi

  log " ✅ Node ${node} is online"
  log " Uptime: ${uptime} seconds"
  # NOTE(review): the raw value is logged with a % suffix; if pvesh reports CPU
  # load as a 0-1 fraction this label is misleading - confirm.
  log " CPU: ${cpu}%"
  # Skip the percentage when the total is 0 to avoid a division-by-zero abort.
  if [[ -n "${memory_total}" && -n "${memory_used}" ]] && (( memory_total > 0 )); then
    memory_percent=$(( memory_used * 100 / memory_total ))
    log " Memory: ${memory_percent}% used (${memory_used}/${memory_total} bytes)"
  fi
  return 0
}
#######################################
# Lists the cluster's nodes through pvesh and runs check_node on each one.
# Arguments: none
# Outputs:   progress and a summary through log.
# Returns:   0 when every node passed check_node, 1 when at least one failed.
#            Exits (via error) when no node name could be retrieved.
#######################################
check_cluster() {
  local raw_nodes remaining node all_healthy=true
  local -a nodes=()
  # Matches one "node":"<name>" pair; whitespace around the colon is tolerated.
  local node_re='"node"[[:space:]]*:[[:space:]]*"([^"]+)"'

  log "Checking cluster status..."

  # Get cluster nodes; a failed query is treated the same as an empty result.
  raw_nodes=$(pvesh get /nodes --output-format json 2>/dev/null) || raw_nodes=""

  # Collect every node name by repeatedly matching and dropping the text up to
  # and including the previous match.
  remaining=${raw_nodes}
  while [[ "${remaining}" =~ ${node_re} ]]; do
    nodes+=("${BASH_REMATCH[1]}")
    remaining=${remaining#*"${BASH_REMATCH[0]}"}
  done

  if (( ${#nodes[@]} == 0 )); then
    error "Cannot retrieve cluster nodes"
  fi

  # Join the names on one line so the log entry keeps its timestamp prefix.
  log "Found nodes: ${nodes[*]}"

  for node in "${nodes[@]}"; do
    if ! check_node "${node}"; then
      all_healthy=false
    fi
  done

  if [[ "${all_healthy}" == "true" ]]; then
    log "✅ All nodes are healthy"
    return 0
  fi
  log "❌ Some nodes are unhealthy"
  return 1
}
#######################################
# Reports whether pvesh lists any storage definitions.
# Globals:   STORAGE (written) - raw JSON returned by the storage query.
# Arguments: none
# Outputs:   progress and result through log.
# Returns:   always 0; an empty or failed query only produces a warning.
#######################################
check_storage() {
  log "Checking storage pools..."

  # A failed pvesh call is folded into the empty-list case.
  STORAGE=$(pvesh get /storage --output-format json 2>/dev/null || echo "[]")

  case "${STORAGE}" in
    "" | "[]")
      log " ⚠️ No storage pools found"
      return 0
      ;;
  esac

  # Simplified check: the payload is only tested for being non-empty; the
  # individual pools are not inspected.
  log " Storage pools configured"
  return 0
}
#######################################
# Counts the guests known to the cluster and logs the total.
# Arguments: none
# Outputs:   progress and the guest count through log.
# Returns:   always 0; a failed or empty query is reported as "No VMs found".
#######################################
check_vms() {
  local resources
  local -a vm_ids=()

  log "Checking virtual machines..."

  # Get all VMs. The cluster resource listing carries the vmid fields; the
  # /nodes listing only describes nodes, so it never yields any vmid.
  # NOTE(review): `--type vm` is expected to include containers as well as
  # QEMU guests - confirm whether containers should be counted here.
  resources=$(pvesh get /cluster/resources --type vm --output-format json 2>/dev/null) \
    || resources=""

  # Collect the distinct ids. The pipeline runs inside a process substitution,
  # so a `grep` that matches nothing cannot abort the script under
  # `set -e` + `pipefail`; it simply leaves the array empty.
  mapfile -t vm_ids < <(
    printf '%s\n' "${resources}" \
      | grep -oE '"vmid"[[:space:]]*:[[:space:]]*"?[0-9]+' \
      | grep -oE '[0-9]+$' \
      | sort -u
  )

  if (( ${#vm_ids[@]} == 0 )); then
    log " No VMs found"
    return 0
  fi

  log " Found ${#vm_ids[@]} virtual machines"
  return 0
}
#######################################
# Runs the health checks selected by the NODE / SITE environment variables.
# Globals:   NODE (read) - when non-empty, only that node is checked.
#            SITE (read) - when non-empty, its name is logged before the
#                          cluster-wide checks.
# Arguments: none are used.
# Outputs:   progress through log.
# Returns:   0 when every executed check succeeded, 1 otherwise.
#######################################
main() {
  local rc=0

  log "Starting Proxmox cluster health check..."

  if [[ -n "${NODE}" ]]; then
    check_node "${NODE}" || rc=1
  else
    if [[ -n "${SITE}" ]]; then
      log "Checking site: ${SITE}"
    fi
    # Record failures instead of letting `set -e` abort on the first one, so
    # the storage and VM checks still run when a node is unhealthy.
    check_cluster || rc=1
    check_storage || rc=1
    check_vms || rc=1
  fi

  log "Health check completed!"
  return "${rc}"
}
# Script entry point: invoke main, forwarding the command-line arguments.
main "$@"