#!/usr/bin/env bash
# Poll Proxmox LXC cluster health over SSH with key auth.
# Collects:
# - /cluster/resources VM inventory (LXCs only) from a seed Proxmox host
# - Per-node load, RAM, PSI, /var/lib/vz usage, and pvesm status
# Emits:
# - Timestamped JSON report under reports/status/
# - Human-readable summary text next to the JSON
# Exit codes:
#   0 = OK
#   1 = WARN findings present
#   2 = CRIT findings present
#   3 = Collection failure / seed unreachable
#
# Usage:
#   bash scripts/verify/poll-lxc-cluster-health.sh
#   SEED_HOST=192.168.11.11 bash scripts/verify/poll-lxc-cluster-health.sh --json
#   OUT_DIR=/tmp bash scripts/verify/poll-lxc-cluster-health.sh
#
# Threshold env overrides:
#   CLUSTER_HEALTH_NODE_LOAD_WARN_PER_CPU=0.90
#   CLUSTER_HEALTH_NODE_LOAD_CRIT_PER_CPU=1.20
#   CLUSTER_HEALTH_NODE_MEM_WARN_PCT=85
#   CLUSTER_HEALTH_NODE_MEM_CRIT_PCT=92
#   CLUSTER_HEALTH_VZ_WARN_PCT=85
#   CLUSTER_HEALTH_VZ_CRIT_PCT=93
#   CLUSTER_HEALTH_PSI_CPU_SOME_WARN=10
#   CLUSTER_HEALTH_PSI_CPU_SOME_CRIT=20
#   CLUSTER_HEALTH_PSI_IO_FULL_WARN=10
#   CLUSTER_HEALTH_PSI_IO_FULL_CRIT=20
#   CLUSTER_HEALTH_PSI_MEM_FULL_WARN=5
#   CLUSTER_HEALTH_PSI_MEM_FULL_CRIT=10
#   CLUSTER_HEALTH_LXC_MEM_WARN_PCT=85
#   CLUSTER_HEALTH_LXC_MEM_CRIT_PCT=95
#   CLUSTER_HEALTH_LXC_CPU_WARN_PCT=20
#   CLUSTER_HEALTH_LXC_CPU_CRIT_PCT=40
#   CLUSTER_HEALTH_NODE_SKEW_WARN_PCT=45
#   CLUSTER_HEALTH_NODE_SKEW_CRIT_PCT=55
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
# shellcheck source=/dev/null
source "${PROJECT_ROOT}/scripts/lib/load-project-env.sh"

# --json prints the JSON report instead of the text summary.
JSON_ONLY=0
case "${1:-}" in
  --json) JSON_ONLY=1 ;;
  "" ) ;;
  -h|--help)
    # Usage/threshold docs live in the header comment block above.
    sed -n '1,48p' "$0"
    exit 0
    ;;
  *)
    echo "ERROR: unknown argument: ${1}" >&2
    exit 2
    ;;
esac

# Connection and output locations (all overridable via env).
PROXMOX_SSH_USER="${PROXMOX_SSH_USER:-root}"
SEED_HOST="${SEED_HOST:-${PROXMOX_HOST_R630_01:-192.168.11.11}}"
OUT_DIR="${OUT_DIR:-${PROJECT_ROOT}/reports/status}"
TS="$(date +%Y%m%d_%H%M%S)"
JSON_OUT="${JSON_OUT:-${OUT_DIR}/lxc_cluster_health_${TS}.json}"
TEXT_OUT="${TEXT_OUT:-${OUT_DIR}/lxc_cluster_health_${TS}.txt}"
mkdir -p "${OUT_DIR}"

# Scratch space for raw collection output; removed on any exit path.
TMP_DIR="$(mktemp -d)"
trap 'rm -rf "${TMP_DIR}"' EXIT

VM_JSON="${TMP_DIR}/cluster_resources_vm.json"
NODES_JSON="${TMP_DIR}/cluster_resources_node.json"
NODE_DIR="${TMP_DIR}/nodes"
mkdir -p "${NODE_DIR}"

# Non-interactive SSH: key auth only, fail fast on unreachable hosts.
ssh_base=(
  ssh
  -o BatchMode=yes
  -o ConnectTimeout=15
  -o StrictHostKeyChecking=no
)

SEED_TARGET="${PROXMOX_SSH_USER}@${SEED_HOST}"
# Map a Proxmox cluster node name to its SSH address.
# Known nodes resolve through PROXMOX_HOST_* env overrides; anything else
# falls back to the node name itself (assumed DNS-resolvable).
# Arguments: $1 - node name as reported by /cluster/resources
# Outputs:   SSH host/IP on stdout
node_ssh_host() {
  case "$1" in
    ml110)   printf '%s\n' "${PROXMOX_HOST_ML110:-$1}" ;;
    r630-01) printf '%s\n' "${PROXMOX_HOST_R630_01:-$1}" ;;
    r630-02) printf '%s\n' "${PROXMOX_HOST_R630_02:-$1}" ;;
    r630-03) printf '%s\n' "${PROXMOX_HOST_R630_03:-$1}" ;;
    r630-04) printf '%s\n' "${PROXMOX_HOST_R630_04:-$1}" ;;
    *)       printf '%s\n' "$1" ;;
  esac
}
# Fail fast (exit 3) when the seed host is unreachable or refuses the
# cluster-resource queries; everything downstream depends on this data.
# NOTE(review): ping -W is seconds on Linux iputils but milliseconds on
# BSD/macOS — confirm target platforms if portability matters.
if ! ping -c1 -W2 "${SEED_HOST}" >/dev/null 2>&1; then
  echo "ERROR: seed unreachable: ${SEED_HOST}" >&2
  exit 3
fi

# Full VM/LXC inventory for the whole cluster, as JSON.
if ! "${ssh_base[@]}" "${SEED_TARGET}" "pvesh get /cluster/resources --type vm --output-format json" >"${VM_JSON}" 2>"${TMP_DIR}/seed_vm.err"; then
  echo "ERROR: failed to query VM resources from ${SEED_HOST}" >&2
  cat "${TMP_DIR}/seed_vm.err" >&2 || true
  exit 3
fi

# Node-level rollups (status, cpu, mem) for every cluster member.
if ! "${ssh_base[@]}" "${SEED_TARGET}" "pvesh get /cluster/resources --type node --output-format json" >"${NODES_JSON}" 2>"${TMP_DIR}/seed_nodes.err"; then
  echo "ERROR: failed to query node resources from ${SEED_HOST}" >&2
  cat "${TMP_DIR}/seed_nodes.err" >&2 || true
  exit 3
fi
# Extract the node names from the cluster resource dump, preserving the
# order the API returned them in.
mapfile -t NODE_ROWS < <(
  python3 - "${NODES_JSON}" <<'PY'
import json, sys
with open(sys.argv[1], 'r', encoding='utf-8') as fh:
    data = json.load(fh)
for row in data:
    node = row.get("node")
    if node:
        print(node)
PY
)

# An empty list means the seed answered but returned nothing usable.
if [[ "${#NODE_ROWS[@]}" -eq 0 ]]; then
  echo "ERROR: no Proxmox nodes returned by cluster resources" >&2
  exit 3
fi
# Script executed on each node over SSH. Sections are delimited by
# __MARKER__ lines so the Python report stage can split them back apart.
# Optional probes (PSI, df, pvesm) tolerate failure via `|| true` so one
# missing feature doesn't sink the whole node's collection.
REMOTE_BODY=$(cat <<'EOS'
set -euo pipefail
echo "__HOSTNAME__"
hostname -s 2>/dev/null || hostname
echo "__UPTIME__"
uptime
echo "__NPROC__"
nproc 2>/dev/null || getconf _NPROCESSORS_ONLN || echo 0
echo "__FREE__"
free -b
echo "__PSI_CPU__"
cat /proc/pressure/cpu 2>/dev/null || true
echo "__PSI_IO__"
cat /proc/pressure/io 2>/dev/null || true
echo "__PSI_MEMORY__"
cat /proc/pressure/memory 2>/dev/null || true
echo "__DF_VZ__"
df -B1 -P /var/lib/vz 2>/dev/null || true
echo "__PVESM__"
pvesm status 2>/dev/null || true
EOS
)
# Collect raw metrics from every node. A failed node is recorded with a
# COLLECTION_FAILED sentinel (followed by its stderr) instead of aborting
# the run; the report stage turns the sentinel into a CRIT alert.
for node in "${NODE_ROWS[@]}"; do
  target="${PROXMOX_SSH_USER}@$(node_ssh_host "${node}")"
  # printf %q shell-quotes the remote body safely for bash -lc.
  if ! "${ssh_base[@]}" "${target}" "bash -lc $(printf '%q' "${REMOTE_BODY}")" >"${NODE_DIR}/${node}.txt" 2>"${NODE_DIR}/${node}.err"; then
    printf 'COLLECTION_FAILED\n' >"${NODE_DIR}/${node}.txt"
    cat "${NODE_DIR}/${node}.err" >>"${NODE_DIR}/${node}.txt" || true
  fi
done
# Analysis/report stage. Run under `set +e` so the Python exit code
# (0/1/2 = ok/warn/crit) can be captured afterwards and propagated as this
# script's own exit status.
set +e
python3 - "${VM_JSON}" "${NODES_JSON}" "${NODE_DIR}" "${JSON_OUT}" "${TEXT_OUT}" "${SEED_HOST}" <<'PY'
import json
import math
import os
import re
import sys
from collections import Counter, defaultdict
from datetime import datetime, timezone

# Positional args mirror the shell invocation above.
vm_json, nodes_json, node_dir, json_out, text_out, seed_host = sys.argv[1:7]
def env_float(name, default):
    """Read env var `name` as a float; fall back to `default` when the
    variable is missing or unparsable."""
    try:
        return float(os.environ.get(name, default))
    except Exception:
        return float(default)


def env_int(name, default):
    """Read env var `name` as an int (parsed via float so "5.0" works);
    fall back to `default` when missing or unparsable."""
    try:
        return int(float(os.environ.get(name, default)))
    except Exception:
        return int(default)
# Alert thresholds; every key can be overridden via the CLUSTER_HEALTH_*
# env vars listed in the script header.
T = {
    "node_load_warn_per_cpu": env_float("CLUSTER_HEALTH_NODE_LOAD_WARN_PER_CPU", 0.90),
    "node_load_crit_per_cpu": env_float("CLUSTER_HEALTH_NODE_LOAD_CRIT_PER_CPU", 1.20),
    "node_mem_warn_pct": env_float("CLUSTER_HEALTH_NODE_MEM_WARN_PCT", 85),
    "node_mem_crit_pct": env_float("CLUSTER_HEALTH_NODE_MEM_CRIT_PCT", 92),
    "vz_warn_pct": env_float("CLUSTER_HEALTH_VZ_WARN_PCT", 85),
    "vz_crit_pct": env_float("CLUSTER_HEALTH_VZ_CRIT_PCT", 93),
    "psi_cpu_some_warn": env_float("CLUSTER_HEALTH_PSI_CPU_SOME_WARN", 10),
    "psi_cpu_some_crit": env_float("CLUSTER_HEALTH_PSI_CPU_SOME_CRIT", 20),
    "psi_io_full_warn": env_float("CLUSTER_HEALTH_PSI_IO_FULL_WARN", 10),
    "psi_io_full_crit": env_float("CLUSTER_HEALTH_PSI_IO_FULL_CRIT", 20),
    "psi_mem_full_warn": env_float("CLUSTER_HEALTH_PSI_MEM_FULL_WARN", 5),
    "psi_mem_full_crit": env_float("CLUSTER_HEALTH_PSI_MEM_FULL_CRIT", 10),
    "lxc_mem_warn_pct": env_float("CLUSTER_HEALTH_LXC_MEM_WARN_PCT", 85),
    "lxc_mem_crit_pct": env_float("CLUSTER_HEALTH_LXC_MEM_CRIT_PCT", 95),
    "lxc_cpu_warn_pct": env_float("CLUSTER_HEALTH_LXC_CPU_WARN_PCT", 20),
    "lxc_cpu_crit_pct": env_float("CLUSTER_HEALTH_LXC_CPU_CRIT_PCT", 40),
    "node_skew_warn_pct": env_float("CLUSTER_HEALTH_NODE_SKEW_WARN_PCT", 45),
    "node_skew_crit_pct": env_float("CLUSTER_HEALTH_NODE_SKEW_CRIT_PCT", 55),
    "summary_top_n": env_int("CLUSTER_HEALTH_SUMMARY_TOP_N", 8),
}

# Raw pvesh dumps written by the collection stage.
with open(vm_json, "r", encoding="utf-8") as fh:
    vm_rows = json.load(fh)
with open(nodes_json, "r", encoding="utf-8") as fh:
    node_rows = json.load(fh)
def parse_uptime_load(text):
    """Extract the [1m, 5m, 15m] load averages from `uptime` output.

    Returns a list of three floats, or None when no load-average triple
    is present in `text`.
    """
    m = re.search(r"load average[s]?:\s*([0-9.]+),\s*([0-9.]+),\s*([0-9.]+)", text)
    if not m:
        return None
    return [float(m.group(1)), float(m.group(2)), float(m.group(3))]
def parse_free(text):
    """Parse `free -b` output into {"total": bytes, "used": bytes}.

    Returns None when no parsable "Mem:" line is found.
    """
    for line in text.splitlines():
        if line.startswith("Mem:"):
            parts = line.split()
            if len(parts) >= 3:
                total = int(parts[1])
                used = int(parts[2])
                return {"total": total, "used": used}
    return None
def parse_psi(section):
    """Parse /proc/pressure/* content into {kind: {metric: float}}.

    e.g. "some avg10=1.50 avg60=..." -> {"some": {"avg10": 1.5, ...}}.
    Empty input yields {}. Malformed lines (no key=value payload) and
    non-numeric values are skipped rather than raising — previously a
    single-word line or a token without "=" crashed with ValueError.
    """
    out = {}
    for line in section.splitlines():
        line = line.strip()
        if not line:
            continue
        parts = line.split(None, 1)
        if len(parts) != 2:
            continue
        kind, rest = parts
        vals = {}
        for token in rest.split():
            if "=" not in token:
                continue
            key, value = token.split("=", 1)
            try:
                vals[key] = float(value)
            except ValueError:
                pass
        out[kind] = vals
    return out
def parse_df_vz(section):
    """Parse `df -B1 -P <path>` output (header line + one data row).

    Returns a dict of byte sizes plus use_pct and mountpoint, or None
    when the section is missing or malformed.
    """
    lines = [line for line in section.splitlines() if line.strip()]
    if len(lines) < 2:
        return None
    parts = lines[-1].split()
    if len(parts) < 6:
        return None
    return {
        "filesystem": parts[0],
        "size_bytes": int(parts[1]),
        "used_bytes": int(parts[2]),
        "avail_bytes": int(parts[3]),
        "use_pct": float(parts[4].rstrip("%")),
        "mountpoint": parts[5],
    }
def parse_pvesm(section):
    """Parse `pvesm status` table rows into storage dicts.

    Columns: name type status total used available. A literal "0" in a
    size column is treated as "unknown" (None); use_pct is computed only
    when both total and used are known.
    NOTE(review): assumes the size columns are always numeric — confirm
    against pvesm output on inactive storages.
    """
    lines = [line.rstrip() for line in section.splitlines() if line.strip()]
    if len(lines) < 2:
        return []
    storage = []
    for line in lines[1:]:
        parts = line.split()
        if len(parts) < 6:
            continue
        item = {
            "name": parts[0],
            "type": parts[1],
            "status": parts[2],
            "total_bytes": None if parts[3] == "0" else int(parts[3]),
            "used_bytes": None if parts[4] == "0" else int(parts[4]),
            "available_bytes": None if parts[5] == "0" else int(parts[5]),
            "use_pct": None,
        }
        if item["total_bytes"] and item["used_bytes"] is not None:
            item["use_pct"] = round((item["used_bytes"] / item["total_bytes"]) * 100, 2)
        storage.append(item)
    return storage
def split_sections(section_text):
    """Split collector output on __MARKER__ delimiter lines.

    Returns {marker_name: body_text} with bodies stripped of surrounding
    whitespace; text before the first marker is discarded.
    """
    sections = {}
    current = None
    bucket = []
    for raw in section_text.splitlines():
        line = raw.rstrip("\n")
        if line.startswith("__") and line.endswith("__"):
            if current is not None:
                sections[current] = "\n".join(bucket).strip()
            # strip("_") turns "__PSI_CPU__" into "PSI_CPU".
            current = line.strip("_")
            bucket = []
            continue
        bucket.append(line)
    if current is not None:
        sections[current] = "\n".join(bucket).strip()
    return sections
# Severity ordering used for alert sorting and overall-status computation.
severity_rank = {"ok": 0, "warn": 1, "crit": 2}
alerts = []


def add_alert(severity, scope, entity, metric, message, value=None, threshold=None):
    """Append a structured finding to the module-level alerts list.

    severity: "warn" | "crit"; scope: "node" | "lxc" | "cluster";
    value/threshold are the observed metric and the limit it crossed
    (None when not applicable, e.g. collection failures).
    """
    alerts.append({
        "severity": severity,
        "scope": scope,
        "entity": entity,
        "metric": metric,
        "message": message,
        "value": value,
        "threshold": threshold,
    })
# Parse each node's raw collector output and evaluate node-level
# thresholds. A COLLECTION_FAILED sentinel (written by the shell stage)
# becomes a CRIT alert instead of crashing the report.
node_metrics = {}
for path in sorted(os.listdir(node_dir)):
    if not path.endswith(".txt"):
        continue
    node_name = path[:-4]
    full_path = os.path.join(node_dir, path)
    text = open(full_path, "r", encoding="utf-8", errors="replace").read()
    if text.startswith("COLLECTION_FAILED"):
        node_metrics[node_name] = {
            "node": node_name,
            "collection_failed": True,
            # Lines after the sentinel carry the ssh stderr, if any.
            "error": text.splitlines()[1:] if len(text.splitlines()) > 1 else [],
        }
        add_alert("crit", "node", node_name, "collection", f"{node_name} metrics collection failed")
        continue

    sections = split_sections(text)
    loads = parse_uptime_load(sections.get("UPTIME", ""))
    nproc = None
    try:
        nproc = int((sections.get("NPROC", "0").splitlines() or ["0"])[0].strip())
    except ValueError:
        nproc = 0
    free_mem = parse_free(sections.get("FREE", ""))
    psi = {
        "cpu": parse_psi(sections.get("PSI_CPU", "")),
        "io": parse_psi(sections.get("PSI_IO", "")),
        "memory": parse_psi(sections.get("PSI_MEMORY", "")),
    }
    df_vz = parse_df_vz(sections.get("DF_VZ", ""))
    pvesm = parse_pvesm(sections.get("PVESM", ""))

    metric = {
        "node": node_name,
        "hostname": (sections.get("HOSTNAME", node_name).splitlines() or [node_name])[0].strip(),
        "collection_failed": False,
        "loadavg": loads,  # may be None when uptime output was unparsable
        "nproc": nproc,
        "load_per_cpu_1m": round(loads[0] / nproc, 3) if loads and nproc else None,
        "memory": None,  # filled below when `free` parsed successfully
        "psi": psi,
        "df_vz": df_vz,  # may be None when df probe failed on the node
        "storage": pvesm,
    }
    if free_mem and free_mem["total"] > 0:
        metric["memory"] = {
            **free_mem,
            "used_pct": round((free_mem["used"] / free_mem["total"]) * 100, 2),
        }
    node_metrics[node_name] = metric

    # Threshold checks: load per core, host RAM, /var/lib/vz, PSI avg10.
    lp = metric["load_per_cpu_1m"]
    if lp is not None:
        if lp >= T["node_load_crit_per_cpu"]:
            add_alert("crit", "node", node_name, "load_per_cpu_1m", f"{node_name} load/core is high", lp, T["node_load_crit_per_cpu"])
        elif lp >= T["node_load_warn_per_cpu"]:
            add_alert("warn", "node", node_name, "load_per_cpu_1m", f"{node_name} load/core is elevated", lp, T["node_load_warn_per_cpu"])

    mem = metric["memory"]
    if mem:
        if mem["used_pct"] >= T["node_mem_crit_pct"]:
            add_alert("crit", "node", node_name, "memory_used_pct", f"{node_name} host RAM usage is high", mem["used_pct"], T["node_mem_crit_pct"])
        elif mem["used_pct"] >= T["node_mem_warn_pct"]:
            add_alert("warn", "node", node_name, "memory_used_pct", f"{node_name} host RAM usage is elevated", mem["used_pct"], T["node_mem_warn_pct"])

    if df_vz:
        if df_vz["use_pct"] >= T["vz_crit_pct"]:
            add_alert("crit", "node", node_name, "vz_use_pct", f"{node_name} /var/lib/vz is near full", df_vz["use_pct"], T["vz_crit_pct"])
        elif df_vz["use_pct"] >= T["vz_warn_pct"]:
            add_alert("warn", "node", node_name, "vz_use_pct", f"{node_name} /var/lib/vz usage is elevated", df_vz["use_pct"], T["vz_warn_pct"])

    cpu_some = psi["cpu"].get("some", {}).get("avg10")
    if cpu_some is not None:
        if cpu_some >= T["psi_cpu_some_crit"]:
            add_alert("crit", "node", node_name, "psi_cpu_some_avg10", f"{node_name} CPU pressure is high", cpu_some, T["psi_cpu_some_crit"])
        elif cpu_some >= T["psi_cpu_some_warn"]:
            add_alert("warn", "node", node_name, "psi_cpu_some_avg10", f"{node_name} CPU pressure is elevated", cpu_some, T["psi_cpu_some_warn"])

    io_full = psi["io"].get("full", {}).get("avg10")
    if io_full is not None:
        if io_full >= T["psi_io_full_crit"]:
            add_alert("crit", "node", node_name, "psi_io_full_avg10", f"{node_name} I/O full pressure is high", io_full, T["psi_io_full_crit"])
        elif io_full >= T["psi_io_full_warn"]:
            add_alert("warn", "node", node_name, "psi_io_full_avg10", f"{node_name} I/O full pressure is elevated", io_full, T["psi_io_full_warn"])

    mem_full = psi["memory"].get("full", {}).get("avg10")
    if mem_full is not None:
        if mem_full >= T["psi_mem_full_crit"]:
            add_alert("crit", "node", node_name, "psi_memory_full_avg10", f"{node_name} memory full pressure is high", mem_full, T["psi_mem_full_crit"])
        elif mem_full >= T["psi_mem_full_warn"]:
            add_alert("warn", "node", node_name, "psi_memory_full_avg10", f"{node_name} memory full pressure is elevated", mem_full, T["psi_mem_full_warn"])
# Classify LXC rows from /cluster/resources and evaluate per-container
# CPU/memory thresholds plus cluster-level placement skew.
lxc_rows = [row for row in vm_rows if row.get("type") == "lxc"]
running_lxcs = [row for row in lxc_rows if row.get("status") == "running"]
stopped_lxcs = [row for row in lxc_rows if row.get("status") != "running"]

# Placement skew: alert when one node hosts too large a share of running LXCs.
node_counts = Counter(row.get("node", "unknown") for row in running_lxcs)
running_total = len(running_lxcs)
for node, count in node_counts.items():
    pct = round((count / running_total) * 100, 2) if running_total else 0.0
    if pct >= T["node_skew_crit_pct"]:
        add_alert("crit", "cluster", node, "running_lxc_share_pct", f"{node} holds a large share of running LXCs", pct, T["node_skew_crit_pct"])
    elif pct >= T["node_skew_warn_pct"]:
        add_alert("warn", "cluster", node, "running_lxc_share_pct", f"{node} holds a high share of running LXCs", pct, T["node_skew_warn_pct"])

mem_hot = []
cpu_hot = []
disk_rw = []
network_totals = []
all_lxcs = []

for row in running_lxcs:
    maxmem = row.get("maxmem") or 0
    mem = row.get("mem") or 0
    mem_pct = round((mem / maxmem) * 100, 2) if maxmem else None
    maxcpu = row.get("maxcpu") or 0
    # NOTE(review): pvesh `cpu` appears to be a 0..1-per-core fraction —
    # confirm the scale against the thresholds before tuning them.
    cpu_pct = round(float(row.get("cpu") or 0) * 100, 2)
    diskread = int(row.get("diskread") or 0)
    diskwrite = int(row.get("diskwrite") or 0)
    netin = int(row.get("netin") or 0)
    netout = int(row.get("netout") or 0)
    entry = {
        "vmid": row.get("vmid"),
        "name": row.get("name"),
        "node": row.get("node"),
        "cpu_pct": cpu_pct,
        "maxcpu": maxcpu,
        "mem_pct": mem_pct,
        "mem_bytes": mem,
        "maxmem_bytes": maxmem,
        "disk_pct": round(((row.get("disk") or 0) / row.get("maxdisk")) * 100, 2) if row.get("maxdisk") else None,
        "disk_bytes": int(row.get("disk") or 0),
        "maxdisk_bytes": int(row.get("maxdisk") or 0),
        "diskread_bytes": diskread,
        "diskwrite_bytes": diskwrite,
        "netin_bytes": netin,
        "netout_bytes": netout,
        "status": row.get("status"),
    }
    all_lxcs.append(entry)
    if mem_pct is not None:
        mem_hot.append(entry)
        if mem_pct >= T["lxc_mem_crit_pct"]:
            add_alert("crit", "lxc", f"{row.get('vmid')}:{row.get('name')}", "memory_used_pct", "LXC memory usage is high", mem_pct, T["lxc_mem_crit_pct"])
        elif mem_pct >= T["lxc_mem_warn_pct"]:
            add_alert("warn", "lxc", f"{row.get('vmid')}:{row.get('name')}", "memory_used_pct", "LXC memory usage is elevated", mem_pct, T["lxc_mem_warn_pct"])
    if cpu_pct >= T["lxc_cpu_crit_pct"]:
        add_alert("crit", "lxc", f"{row.get('vmid')}:{row.get('name')}", "cpu_pct", "LXC CPU usage is high", cpu_pct, T["lxc_cpu_crit_pct"])
    elif cpu_pct >= T["lxc_cpu_warn_pct"]:
        add_alert("warn", "lxc", f"{row.get('vmid')}:{row.get('name')}", "cpu_pct", "LXC CPU usage is elevated", cpu_pct, T["lxc_cpu_warn_pct"])
    cpu_hot.append(entry)
    disk_rw.append({**entry, "disk_total_bytes": diskread + diskwrite})
    network_totals.append({**entry, "network_total_bytes": netin + netout})

# Rank the hot lists (descending); vmid is the deterministic tie-breaker.
mem_hot.sort(key=lambda x: (-1 if x["mem_pct"] is None else -x["mem_pct"], x["vmid"]))
cpu_hot.sort(key=lambda x: (-x["cpu_pct"], x["vmid"]))
disk_rw.sort(key=lambda x: (-x["disk_total_bytes"], x["vmid"]))
network_totals.sort(key=lambda x: (-x["network_total_bytes"], x["vmid"]))

# Most severe findings first, then stable ordering for readability.
alerts.sort(key=lambda a: (-severity_rank[a["severity"]], a["scope"], str(a["entity"]), a["metric"]))
# Overall status is the worst severity present in the alert list.
overall = "ok"
if any(a["severity"] == "crit" for a in alerts):
    overall = "crit"
elif any(a["severity"] == "warn" for a in alerts):
    overall = "warn"

# Per-node summary merging cluster-API rollups with collected host metrics.
node_summary = []
for row in node_rows:
    node_name = row.get("node")
    metric = node_metrics.get(node_name, {"collection_failed": True})
    failed = bool(metric.get("collection_failed"))
    # BUGFIX: "loadavg"/"memory"/"df_vz" can be present-but-None even on a
    # successful collection (unparsable uptime/free, failed df probe), so
    # metric.get(key, {}) returned None and .get()/[0] raised
    # AttributeError/TypeError. `(x or default)` handles both missing and
    # None-valued keys.
    node_summary.append({
        "node": node_name,
        "status": row.get("status"),
        "running_lxcs": node_counts.get(node_name, 0),
        "cluster_cpu_fraction_pct": round(float(row.get("cpu") or 0) * 100, 2) if row.get("cpu") is not None else None,
        "cluster_mem_fraction_pct": round(((row.get("mem") or 0) / row.get("maxmem")) * 100, 2) if row.get("maxmem") else None,
        "loadavg_1m": (metric.get("loadavg") or [None])[0] if not failed else None,
        "load_per_cpu_1m": metric.get("load_per_cpu_1m"),
        "host_mem_used_pct": (metric.get("memory") or {}).get("used_pct") if not failed else None,
        "psi_cpu_some_avg10": (metric.get("psi") or {}).get("cpu", {}).get("some", {}).get("avg10") if not failed else None,
        "psi_io_full_avg10": (metric.get("psi") or {}).get("io", {}).get("full", {}).get("avg10") if not failed else None,
        "psi_memory_full_avg10": (metric.get("psi") or {}).get("memory", {}).get("full", {}).get("avg10") if not failed else None,
        "vz_use_pct": (metric.get("df_vz") or {}).get("use_pct") if not failed else None,
    })
# Final report document: `nodes` is the flattened summary, `node_metrics`
# keeps the full per-node detail, `top_lxcs` the ranked hot lists.
report = {
    "collected_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
    "seed_host": seed_host,
    "overall_status": overall,
    "thresholds": T,
    "cluster": {
        "total_lxcs": len(lxc_rows),
        "running_lxcs": running_total,
        "stopped_lxcs": len(stopped_lxcs),
        "running_distribution": [
            {
                "node": node,
                "running_lxcs": count,
                "share_pct": round((count / running_total) * 100, 2) if running_total else 0.0,
            }
            for node, count in sorted(node_counts.items())
        ],
    },
    "nodes": node_summary,
    "node_metrics": node_metrics,
    "top_lxcs": {
        "memory_pct": mem_hot[:T["summary_top_n"]],
        "cpu_pct": cpu_hot[:T["summary_top_n"]],
        "disk_total_bytes": disk_rw[:T["summary_top_n"]],
        "network_total_bytes": network_totals[:T["summary_top_n"]],
    },
    # NOTE(review): assumes every LXC row carries an int-coercible vmid —
    # confirm against pvesh output.
    "lxcs": sorted(all_lxcs, key=lambda x: (str(x["node"]), int(x["vmid"]))),
    "alerts": alerts,
}

with open(json_out, "w", encoding="utf-8") as fh:
    json.dump(report, fh, indent=2)
    fh.write("\n")
def gib(n):
    """Convert a byte count to GiB, rounded to 2 decimal places."""
    return round(n / (1024 ** 3), 2)
# Build the human-readable summary mirroring the JSON highlights.
lines = []
lines.append(f"LXC cluster health {report['collected_at']} ({overall.upper()})")
lines.append(f"Seed host: {report['seed_host']}")
lines.append(f"LXCs: running {running_total} / total {len(lxc_rows)} / stopped {len(stopped_lxcs)}")
lines.append("")
lines.append("Node summary:")
for item in node_summary:
    lines.append(
        f"- {item['node']}: running_lxcs={item['running_lxcs']}, "
        f"load1={item['loadavg_1m'] if item['loadavg_1m'] is not None else 'n/a'}, "
        f"load/core={item['load_per_cpu_1m'] if item['load_per_cpu_1m'] is not None else 'n/a'}, "
        f"host_mem={item['host_mem_used_pct'] if item['host_mem_used_pct'] is not None else 'n/a'}%, "
        f"psi_io_full_avg10={item['psi_io_full_avg10'] if item['psi_io_full_avg10'] is not None else 'n/a'}, "
        f"vz={item['vz_use_pct'] if item['vz_use_pct'] is not None else 'n/a'}%"
    )
lines.append("")
lines.append("Top findings:")
if alerts:
    # Show at least 10 findings even when summary_top_n is smaller.
    for alert in alerts[: max(T["summary_top_n"], 10)]:
        value = "" if alert["value"] is None else f" value={alert['value']}"
        threshold = "" if alert["threshold"] is None else f" threshold={alert['threshold']}"
        lines.append(f"- [{alert['severity'].upper()}] {alert['scope']} {alert['entity']} {alert['metric']}: {alert['message']}{value}{threshold}")
else:
    lines.append("- none")

lines.append("")
lines.append("Top LXC memory:")
for item in mem_hot[: T["summary_top_n"]]:
    lines.append(f"- {item['vmid']} {item['name']} @ {item['node']}: mem={item['mem_pct']}% of {gib(item['maxmem_bytes'])} GiB, cpu={item['cpu_pct']}%")

lines.append("")
lines.append("Top LXC CPU:")
for item in cpu_hot[: T["summary_top_n"]]:
    lines.append(f"- {item['vmid']} {item['name']} @ {item['node']}: cpu={item['cpu_pct']}%, mem={item['mem_pct']}%")

lines.append("")
lines.append("Top cumulative disk:")
for item in disk_rw[: T["summary_top_n"]]:
    lines.append(f"- {item['vmid']} {item['name']} @ {item['node']}: read+write={gib(item['disk_total_bytes'])} GiB")

lines.append("")
lines.append("Top cumulative network:")
for item in network_totals[: T["summary_top_n"]]:
    lines.append(f"- {item['vmid']} {item['name']} @ {item['node']}: in+out={gib(item['network_total_bytes'])} GiB")

with open(text_out, "w", encoding="utf-8") as fh:
    fh.write("\n".join(lines).rstrip() + "\n")
# Machine-readable pointer (paths + status) on stdout for callers.
print(json.dumps({
    "overall_status": overall,
    "json_report": json_out,
    "text_report": text_out,
    "alerts": len(alerts),
}))

# Exit code mirrors overall status: 0 ok, 1 warn, 2 crit (the shell
# wrapper propagates it; 3 is reserved for collection failures).
if overall == "crit":
    sys.exit(2)
if overall == "warn":
    sys.exit(1)
sys.exit(0)
PY
# Capture the Python stage's status (0/1/2) before re-enabling errexit.
RC=$?
set -e

# Show the generated report on stdout; --json emits the raw JSON, the
# default mode prints the text summary plus the report paths.
if [[ "${JSON_ONLY}" -eq 1 ]]; then
  cat "${JSON_OUT}"
else
  cat "${TEXT_OUT}"
  echo
  echo "JSON: ${JSON_OUT}"
  echo "Text: ${TEXT_OUT}"
fi

exit "${RC}"