#!/usr/bin/env bash
# Diagnose and Fix Storage Issues for Proxmox Container Migrations
#
# This script checks storage configuration on the ml110, pve and pve2 nodes,
# picks a usable storage on the target node and migrates the configured
# containers to it.
#
# Environment:
#   PROXMOX_HOST_ML110, PROXMOX_HOST_R630_01, PROXMOX_HOST_R630_02
#       Node addresses, normally provided by config/ip-addresses.conf.
#   ML110_PASS, PVE_PASS, PVE2_PASS
#       Root passwords; when set they override the built-in defaults.
#   NON_INTERACTIVE=1
#       Skip the confirmation prompt.
#
# Requires: sshpass and ssh locally; pvesh, pvesm, pct and jq on the nodes.

set -euo pipefail

# Load IP configuration (best effort; the required values are validated below)
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
# shellcheck source=/dev/null
source "${PROJECT_ROOT}/config/ip-addresses.conf" 2>/dev/null || true

# Configuration
# Fail with an explicit message instead of a bare "unbound variable" error
# when the configuration file was missing or incomplete.
PROXMOX_HOST_ML110="${PROXMOX_HOST_ML110:?PROXMOX_HOST_ML110 is not set (expected from config/ip-addresses.conf)}"
PROXMOX_HOST_PVE="${PROXMOX_HOST_R630_01:?PROXMOX_HOST_R630_01 is not set (expected from config/ip-addresses.conf)}"
PROXMOX_HOST_PVE2="${PROXMOX_HOST_R630_02:?PROXMOX_HOST_R630_02 is not set (expected from config/ip-addresses.conf)}"

# TODO: remove these default credentials from source control; supply them via
# the environment (or switch to SSH keys) instead.
ML110_PASS="${ML110_PASS:-L@kers2010}"
PVE_PASS="${PVE_PASS:-password}"
PVE2_PASS="${PVE2_PASS:-password}"

# Containers to migrate
CONTAINERS=(1504 2503 2504 6201)
TARGET_NODE="pve"

# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m'

# Logging helpers. They write to stdout; functions whose stdout is captured by
# a caller must redirect their log calls to stderr (">&2").
log_info() { echo -e "${BLUE}[INFO]${NC} $1"; }
log_success() { echo -e "${GREEN}[✓]${NC} $1"; }
log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; }
log_error() { echo -e "${RED}[ERROR]${NC} $1"; }
log_header() { echo -e "${CYAN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"; }

#######################################
# Run a command on a node as root over SSH, keeping stdout and stderr separate.
# The password is handed to sshpass through the SSHPASS environment variable
# (sshpass -e) so that it does not show up in the process argument list.
# Arguments: $1 - host, $2 - password, remaining - remote command
# Returns:   exit status of sshpass/ssh
#######################################
ssh_exec() {
  local host=$1
  local pass=$2
  shift 2
  SSHPASS="$pass" sshpass -e ssh \
    -o ConnectTimeout=5 \
    -o StrictHostKeyChecking=accept-new \
    "root@${host}" "$@"
}

#######################################
# SSH helper with password. Same as ssh_exec but merges the remote/ssh stderr
# into stdout, which is what the interactive diagnostics and the migration
# error capture rely on.
# Arguments: $1 - host, $2 - password, remaining - remote command
#######################################
ssh_node() {
  ssh_exec "$@" 2>&1
}

#######################################
# Print the address configured for a node name.
# Arguments: $1 - node name (ml110 | pve | pve2)
# Returns:   1 for an unknown node name
#######################################
node_host() {
  case "$1" in
    ml110) printf '%s\n' "$PROXMOX_HOST_ML110" ;;
    pve) printf '%s\n' "$PROXMOX_HOST_PVE" ;;
    pve2) printf '%s\n' "$PROXMOX_HOST_PVE2" ;;
    *) return 1 ;;
  esac
}

#######################################
# Print the root password configured for a node name.
# Arguments: $1 - node name (ml110 | pve | pve2)
# Returns:   1 for an unknown node name
#######################################
node_pass() {
  case "$1" in
    ml110) printf '%s\n' "$ML110_PASS" ;;
    pve) printf '%s\n' "$PVE_PASS" ;;
    pve2) printf '%s\n' "$PVE2_PASS" ;;
    *) return 1 ;;
  esac
}

#######################################
# Check if node is reachable.
# Arguments: $1 - host, $2 - password, $3 - node name (for messages)
# Returns:   0 when an SSH command could be run, 1 otherwise
#######################################
check_node_connectivity() {
  local host=$1
  local pass=$2
  local node_name=$3

  log_info "Checking connectivity to $node_name ($host)..."
  if ssh_exec "$host" "$pass" "echo 'connected'" 2>/dev/null; then
    log_success "$node_name is reachable"
    return 0
  else
    log_error "$node_name is not reachable"
    return 1
  fi
}

#######################################
# Get storage status on a node (prints `pvesm status`).
# Arguments: $1 - host, $2 - password, $3 - node name (for messages)
# Returns:   1 when the remote command failed
#######################################
get_storage_status() {
  local host=$1
  local pass=$2
  local node_name=$3

  log_info "Checking storage status on $node_name..."
  echo ""
  ssh_node "$host" "$pass" "pvesm status" || {
    log_error "Failed to get storage status from $node_name"
    return 1
  }
  echo ""
}

#######################################
# Get volume groups on a node (prints `vgs`).
# Arguments: $1 - host, $2 - password, $3 - node name (for messages)
# Returns:   1 when the remote command failed
#######################################
get_volume_groups() {
  local host=$1
  local pass=$2
  local node_name=$3

  log_info "Checking volume groups on $node_name..."
  echo ""
  ssh_node "$host" "$pass" "vgs" || {
    log_warn "Failed to get volume groups from $node_name (may not have LVM)"
    return 1
  }
  echo ""
}

#######################################
# Get storage configuration (prints /etc/pve/storage.cfg). Never fails.
# Arguments: $1 - host, $2 - password, $3 - node name (for messages)
#######################################
get_storage_config() {
  local host=$1
  local pass=$2
  local node_name=$3

  log_info "Checking storage configuration on $node_name..."
  echo ""
  ssh_node "$host" "$pass" "cat /etc/pve/storage.cfg 2>/dev/null || echo 'No storage.cfg found'" || true
  echo ""
}

#######################################
# Print the status of a container as reported by the Proxmox API.
# pvesh prints a text table by default, so JSON output is requested
# explicitly before handing the result to jq.
# Arguments: $1 - host, $2 - password, $3 - node name, $4 - vmid
# Outputs:   the status string on stdout; empty when it could not be read
#######################################
get_container_status() {
  local host=$1
  local pass=$2
  local node=$3
  local vmid=$4

  ssh_exec "$host" "$pass" \
    "pvesh get /nodes/$node/lxc/$vmid/status/current --output-format json 2>/dev/null | jq -r '.status' 2>/dev/null" \
    2>/dev/null
}

#######################################
# Check where a container is located.
# Arguments: $1 - vmid
# Outputs:   node name (ml110 | pve | pve2) or "not_found" on stdout;
#            progress messages go to stderr because callers capture stdout
# Returns:   0 when found, 1 otherwise
#######################################
find_container_location() {
  local vmid=$1
  local node status

  log_info "Finding location of container $vmid..." >&2

  for node in ml110 pve pve2; do
    status=$(get_container_status "$(node_host "$node")" "$(node_pass "$node")" "$node" "$vmid") || status=""
    if [[ "$status" == "running" || "$status" == "stopped" ]]; then
      echo "$node"
      return 0
    fi
  done

  echo "not_found"
  return 1
}

#######################################
# Get container storage info (the storage ID of the container's rootfs).
# Arguments: $1 - vmid, $2 - node name (for messages), $3 - host, $4 - password
# Outputs:   storage ID or "unknown" on stdout; log message on stderr
#######################################
get_container_storage() {
  local vmid=$1
  local node=$2
  local host=$3
  local pass=$4
  local rootfs

  log_info "Getting storage info for container $vmid on $node..." >&2

  # Get rootfs storage
  rootfs=$(ssh_exec "$host" "$pass" \
    "pct config $vmid 2>/dev/null | grep '^rootfs:' | awk '{print \$2}' | cut -d: -f1" \
    2>/dev/null) || rootfs=""
  echo "${rootfs:-unknown}"
}

#######################################
# Check if storage exists and is active on target node.
# Arguments: $1 - target node (pve | pve2), $2 - storage ID
# Returns:   0 when the storage is listed as active, 1 otherwise
#######################################
check_target_storage() {
  local target_node=$1
  local storage_name=$2
  local host pass table status

  case "$target_node" in
    pve | pve2)
      host=$(node_host "$target_node")
      pass=$(node_pass "$target_node")
      ;;
    *)
      log_error "Unknown target node: $target_node"
      return 1
      ;;
  esac

  log_info "Checking if storage '$storage_name' exists and is active on $target_node..."

  table=$(ssh_exec "$host" "$pass" "pvesm status 2>/dev/null" 2>/dev/null) || table=""
  # Exact match on the name column; a prefix match would confuse e.g.
  # "local" with "local-lvm".
  status=$(awk -v s="$storage_name" '$1 == s { print $3 }' <<<"$table")

  if [[ -z "$status" ]]; then
    log_error "Storage '$storage_name' not found on $target_node"
    return 1
  fi

  # Compare the whole word: a substring test would accept "inactive".
  case "${status,,}" in
    active | enabled)
      log_success "Storage '$storage_name' is active on $target_node"
      return 0
      ;;
    *)
      log_warn "Storage '$storage_name' exists but is not active (status: $status)"
      return 1
      ;;
  esac
}

#######################################
# Fix storage configuration on target node: pick a storage to migrate to.
# The preferred storage is tried first, then thin1, local-lvm and local; only
# storages listed as active by `pvesm status` are accepted.
# Arguments: $1 - target node (pve | pve2), $2 - preferred storage ID (optional)
# Outputs:   chosen storage ID on stdout; all messages go to stderr because
#            callers capture stdout
# Returns:   1 when the node is unknown or no suitable storage is active
#######################################
fix_target_storage() {
  local target_node=$1
  local preferred_storage=${2:-}
  local host pass table candidate
  local available_storage=""

  case "$target_node" in
    pve | pve2)
      host=$(node_host "$target_node")
      pass=$(node_pass "$target_node")
      ;;
    *)
      log_error "Unknown target node: $target_node" >&2
      return 1
      ;;
  esac

  log_info "Attempting to fix storage configuration on $target_node..." >&2

  # Check available storage
  table=$(ssh_exec "$host" "$pass" "pvesm status 2>/dev/null" 2>/dev/null) || table=""
  for candidate in "$preferred_storage" thin1 local-lvm local; do
    [[ -n "$candidate" ]] || continue
    if awk -v s="$candidate" '$1 == s && $3 == "active" { ok = 1 } END { exit !ok }' <<<"$table"; then
      available_storage=$candidate
      break
    fi
  done

  if [[ -z "$available_storage" ]]; then
    log_error "No suitable storage found on $target_node" >&2
    log_info "Available storage:" >&2
    ssh_node "$host" "$pass" "pvesm status" >&2 || true
    return 1
  fi

  log_success "Found available storage: $available_storage on $target_node" >&2
  echo "$available_storage"
}

#######################################
# Diagnose all nodes: connectivity, storage status and volume groups.
# A failing diagnostic on one node is reported but does not stop the script.
#######################################
diagnose_all_nodes() {
  local node host pass

  log_header
  log_info "DIAGNOSTIC PHASE - Checking All Nodes"
  log_header
  echo ""

  for node in ml110 pve pve2; do
    host=$(node_host "$node")
    pass=$(node_pass "$node")
    if check_node_connectivity "$host" "$pass" "$node"; then
      # These only report; without the guard their non-zero status would
      # terminate the whole script under `set -e`.
      get_storage_status "$host" "$pass" "$node" || true
      get_volume_groups "$host" "$pass" "$node" || true
    fi
  done
}

#######################################
# Diagnose container locations and storage for every entry in CONTAINERS.
#######################################
diagnose_containers() {
  local vmid location host pass storage status

  log_header
  log_info "CONTAINER DIAGNOSTIC PHASE"
  log_header
  echo ""

  for vmid in "${CONTAINERS[@]}"; do
    log_info "Container $vmid:"

    # Find location ("not_found" comes with a non-zero status, hence the guard)
    location=$(find_container_location "$vmid") || true
    log_info " Location: $location"

    if [[ "$location" != "not_found" ]]; then
      host=$(node_host "$location")
      pass=$(node_pass "$location")

      # Get storage
      storage=$(get_container_storage "$vmid" "$location" "$host" "$pass") || storage="unknown"
      log_info " Current storage: $storage"

      # Get status
      status=$(get_container_status "$host" "$pass" "$location" "$vmid") || status=""
      log_info " Status: ${status:-unknown}"
    else
      log_warn " Container $vmid not found on any node"
    fi
    echo ""
  done
}

#######################################
# Fix storage and attempt migration of every container to TARGET_NODE.
# Returns: 0 when every container ended up on the target node (or was
#          skipped), 1 when no target storage was found or a migration failed
#######################################
fix_and_migrate() {
  local target_storage vmid location source_host source_pass status
  local migrate_output migrate_exit migrated new_location i
  local failed=0

  log_header
  log_info "FIX AND MIGRATION PHASE"
  log_header
  echo ""

  # Determine target storage
  log_info "Determining target storage on $TARGET_NODE..."
  target_storage=$(fix_target_storage "$TARGET_NODE" "thin1") || target_storage=""

  if [[ -z "$target_storage" ]]; then
    log_error "Cannot determine target storage. Aborting migration."
    return 1
  fi

  log_success "Using target storage: $target_storage on $TARGET_NODE"
  echo ""

  # Migrate each container
  for vmid in "${CONTAINERS[@]}"; do
    log_info "Processing container $vmid..."

    # Find current location
    location=$(find_container_location "$vmid") || true

    if [[ "$location" == "not_found" ]]; then
      log_warn "Container $vmid not found, skipping..."
      continue
    fi

    if [[ "$location" == "$TARGET_NODE" ]]; then
      log_success "Container $vmid is already on $TARGET_NODE"
      continue
    fi

    # Get source host and pass
    source_host=$(node_host "$location")
    source_pass=$(node_pass "$location")

    # Stop container if running
    log_info " Checking container status..."
    status=$(get_container_status "$source_host" "$source_pass" "$location" "$vmid") || status="stopped"

    if [[ "$status" == "running" ]]; then
      log_info " Stopping container $vmid..."
      ssh_node "$source_host" "$source_pass" "pct stop $vmid" || {
        log_warn " Failed to stop container, trying shutdown..."
        ssh_node "$source_host" "$source_pass" \
          "pvesh create /nodes/$location/lxc/$vmid/status/shutdown --timeout 30" || true
      }
      sleep 5
    fi

    # Attempt migration
    log_info " Migrating container $vmid from $location to $TARGET_NODE..."
    log_info " Target storage: $target_storage"

    # Try migration with storage specification first. The status is taken via
    # `|| migrate_exit=$?` so that a failure is neither masked nor fatal under
    # `set -e`.
    migrate_exit=0
    migrate_output=$(ssh_node "$source_host" "$source_pass" \
      "pvesh create /nodes/$location/lxc/$vmid/migrate --target $TARGET_NODE --target-storage $target_storage --online 0") ||
      migrate_exit=$?

    if ((migrate_exit != 0)); then
      log_warn " Migration with storage specification failed, trying without storage..."
      # Try without storage (Proxmox will use default)
      migrate_exit=0
      migrate_output=$(ssh_node "$source_host" "$source_pass" \
        "pvesh create /nodes/$location/lxc/$vmid/migrate --target $TARGET_NODE --online 0") ||
        migrate_exit=$?
    fi

    if ((migrate_exit == 0)); then
      log_success " Migration command completed for container $vmid"

      # Wait and verify
      log_info " Waiting for migration to complete..."
      migrated=false
      for i in {1..12}; do
        sleep 5
        new_location=$(find_container_location "$vmid") || true
        if [[ "$new_location" == "$TARGET_NODE" ]]; then
          log_success " Container $vmid is now on $TARGET_NODE"
          migrated=true
          break
        fi
        if ((i < 12)); then
          log_info " Still migrating... (attempt $i/12)"
        fi
      done

      if [[ "$migrated" == "false" ]]; then
        log_warn " Migration may still be in progress or failed"
        log_info " Please verify manually: ssh root@$source_host 'pvesh get /nodes/$TARGET_NODE/lxc'"
        failed=$((failed + 1))
      fi
    else
      log_error " Migration failed for container $vmid"
      log_info " Error output: $migrate_output"
      failed=$((failed + 1))
    fi

    echo ""
  done

  if ((failed == 0)); then
    log_success "All containers migrated successfully!"
    return 0
  else
    log_warn "$failed container(s) failed to migrate"
    return 1
  fi
}

#######################################
# Main execution: confirm, diagnose, migrate, then report final locations.
#######################################
main() {
  local vmid location

  echo ""
  log_header
  log_info "Proxmox Storage Migration Diagnostic and Fix Tool"
  log_header
  echo ""

  # Every remote operation goes through sshpass; fail early with a clear
  # message instead of reporting every node as unreachable.
  if ! command -v sshpass >/dev/null 2>&1; then
    log_error "sshpass is required but was not found in PATH"
    exit 1
  fi

  log_info "This script will:"
  log_info " 1. Diagnose storage configuration on all nodes"
  log_info " 2. Check container locations and storage"
  log_info " 3. Fix storage issues if needed"
  log_info " 4. Attempt to migrate containers: ${CONTAINERS[*]}"
  log_info " 5. Target node: $TARGET_NODE"
  echo ""

  # Check for non-interactive mode
  if [[ "${NON_INTERACTIVE:-}" == "1" ]] || [[ ! -t 0 ]]; then
    log_info "Non-interactive mode: proceeding automatically"
  else
    read -p "Continue? (y/N): " -n 1 -r
    echo ""
    if [[ ! $REPLY =~ ^[Yy]$ ]]; then
      log_info "Operation cancelled"
      exit 0
    fi
  fi

  echo ""

  # Phase 1: Diagnose
  diagnose_all_nodes
  echo ""
  diagnose_containers
  echo ""

  # Phase 2: Fix and migrate
  if fix_and_migrate; then
    log_success "Migration process completed successfully!"
  else
    log_warn "Migration process completed with some failures"
    log_info "Review the output above for details"
  fi

  echo ""
  log_header
  log_info "Final Container Locations"
  log_header
  echo ""

  for vmid in "${CONTAINERS[@]}"; do
    location=$(find_container_location "$vmid") || true
    if [[ "$location" != "not_found" ]]; then
      log_success "Container $vmid: $location"
    else
      log_warn "Container $vmid: not found"
    fi
  done
  echo ""
}

main "$@"