Files
proxmox/scripts/verify/poll-lxc-cluster-health.sh
defiQUG dbd517b279 Sync workspace: config, docs, scripts, CI, operator rules, and submodule pointers.
- Update dbis_core, cross-chain-pmm-lps, explorer-monorepo, metamask-integration, pr-workspace/chains
- Omit embedded publish git dirs and empty placeholders from index

Made-with: Cursor
2026-04-12 06:12:20 -07:00

614 lines
23 KiB
Bash
Executable File

#!/usr/bin/env bash
# Poll Proxmox LXC cluster health over SSH with key auth.
# Collects:
# - /cluster/resources VM inventory (LXCs only) from a seed Proxmox host
# - Per-node load, RAM, PSI, /var/lib/vz usage, and pvesm status
# Emits:
# - Timestamped JSON report under reports/status/
# - Human-readable summary text next to the JSON
# Exit codes:
# 0 = OK
# 1 = WARN findings present
# 2 = CRIT findings present
# 3 = Collection failure / seed unreachable
#
# Usage:
# bash scripts/verify/poll-lxc-cluster-health.sh
# SEED_HOST=192.168.11.11 bash scripts/verify/poll-lxc-cluster-health.sh --json
# OUT_DIR=/tmp bash scripts/verify/poll-lxc-cluster-health.sh
#
# Threshold env overrides:
# CLUSTER_HEALTH_NODE_LOAD_WARN_PER_CPU=0.90
# CLUSTER_HEALTH_NODE_LOAD_CRIT_PER_CPU=1.20
# CLUSTER_HEALTH_NODE_MEM_WARN_PCT=85
# CLUSTER_HEALTH_NODE_MEM_CRIT_PCT=92
# CLUSTER_HEALTH_VZ_WARN_PCT=85
# CLUSTER_HEALTH_VZ_CRIT_PCT=93
# CLUSTER_HEALTH_PSI_CPU_SOME_WARN=10
# CLUSTER_HEALTH_PSI_CPU_SOME_CRIT=20
# CLUSTER_HEALTH_PSI_IO_FULL_WARN=10
# CLUSTER_HEALTH_PSI_IO_FULL_CRIT=20
# CLUSTER_HEALTH_PSI_MEM_FULL_WARN=5
# CLUSTER_HEALTH_PSI_MEM_FULL_CRIT=10
# CLUSTER_HEALTH_LXC_MEM_WARN_PCT=85
# CLUSTER_HEALTH_LXC_MEM_CRIT_PCT=95
# CLUSTER_HEALTH_LXC_CPU_WARN_PCT=20
# CLUSTER_HEALTH_LXC_CPU_CRIT_PCT=40
# CLUSTER_HEALTH_NODE_SKEW_WARN_PCT=45
# CLUSTER_HEALTH_NODE_SKEW_CRIT_PCT=55
set -euo pipefail

# Resolve the repository root from this script's own location so the script
# works regardless of CWD, then load the shared project environment
# (PROXMOX_HOST_* addresses etc. used below).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
# shellcheck source=/dev/null
source "${PROJECT_ROOT}/scripts/lib/load-project-env.sh"
# Parse the single optional argument.
#   --json     emit only the JSON report on stdout
#   -h/--help  print the file's comment header as usage text
JSON_ONLY=0
case "${1:-}" in
  --json) JSON_ONLY=1 ;;
  "" ) ;;
  -h|--help)
    # Print the leading comment block (shebang + doc header) and stop at the
    # first non-comment line. The previous hard-coded range (1,48p) printed
    # past the 38-line header into the code, and would drift as the header
    # changes; this form always tracks the header exactly.
    sed -n '/^#/!q;p' "$0"
    exit 0
    ;;
  *)
    echo "ERROR: unknown argument: ${1}" >&2
    # NOTE(review): exit 2 here overlaps the documented CRIT exit code;
    # kept for backward compatibility with existing callers.
    exit 2
    ;;
esac
# Connection and output configuration (all overridable via environment).
PROXMOX_SSH_USER="${PROXMOX_SSH_USER:-root}"
SEED_HOST="${SEED_HOST:-${PROXMOX_HOST_R630_01:-192.168.11.11}}"
OUT_DIR="${OUT_DIR:-${PROJECT_ROOT}/reports/status}"

# Timestamped report paths; JSON and text summaries live side by side.
TS="$(date +%Y%m%d_%H%M%S)"
JSON_OUT="${JSON_OUT:-${OUT_DIR}/lxc_cluster_health_${TS}.json}"
TEXT_OUT="${TEXT_OUT:-${OUT_DIR}/lxc_cluster_health_${TS}.txt}"
mkdir -p "${OUT_DIR}"

# Scratch area for raw collector output; removed on every exit path.
TMP_DIR="$(mktemp -d)"
trap 'rm -rf "${TMP_DIR}"' EXIT
VM_JSON="${TMP_DIR}/cluster_resources_vm.json"
NODES_JSON="${TMP_DIR}/cluster_resources_node.json"
NODE_DIR="${TMP_DIR}/nodes"
mkdir -p "${NODE_DIR}"
# Non-interactive SSH defaults shared by every remote call in this script:
# key-auth only (BatchMode suppresses password prompts), a bounded connect
# timeout, and no interactive host-key confirmation on first contact.
ssh_base=(
  ssh
  -o BatchMode=yes
  -o ConnectTimeout=15
  -o StrictHostKeyChecking=no
)
SEED_TARGET="${PROXMOX_SSH_USER}@${SEED_HOST}"
#######################################
# Map a Proxmox cluster node name to its SSH address.
# Known nodes resolve through their PROXMOX_HOST_* env override; any other
# name falls back to itself (relying on DNS / ssh_config).
# Arguments: $1 - cluster node name
# Outputs:   resolved host on stdout
#######################################
node_ssh_host() {
  local name="$1"
  local host
  case "${name}" in
    ml110)   host="${PROXMOX_HOST_ML110:-${name}}" ;;
    r630-01) host="${PROXMOX_HOST_R630_01:-${name}}" ;;
    r630-02) host="${PROXMOX_HOST_R630_02:-${name}}" ;;
    r630-03) host="${PROXMOX_HOST_R630_03:-${name}}" ;;
    r630-04) host="${PROXMOX_HOST_R630_04:-${name}}" ;;
    *)       host="${name}" ;;
  esac
  printf '%s\n' "${host}"
}
# Fail fast (exit 3) when the seed host is not even reachable at ICMP level;
# everything below depends on it.
if ! ping -c1 -W2 "${SEED_HOST}" >/dev/null 2>&1; then
  echo "ERROR: seed unreachable: ${SEED_HOST}" >&2
  exit 3
fi

#######################################
# Fetch one /cluster/resources dump from the seed node via pvesh.
# Arguments: $1 - resource type (vm|node)
#            $2 - destination file for the JSON payload
#            $3 - scratch file for captured stderr
#            $4 - human label used in the error message
# Exits 3 on failure, replaying the captured stderr.
#######################################
fetch_cluster_resources() {
  local rtype="$1" dest="$2" errfile="$3" label="$4"
  if ! "${ssh_base[@]}" "${SEED_TARGET}" \
      "pvesh get /cluster/resources --type ${rtype} --output-format json" \
      >"${dest}" 2>"${errfile}"; then
    echo "ERROR: failed to query ${label} resources from ${SEED_HOST}" >&2
    cat "${errfile}" >&2 || true
    exit 3
  fi
}

fetch_cluster_resources vm "${VM_JSON}" "${TMP_DIR}/seed_vm.err" "VM"
fetch_cluster_resources node "${NODES_JSON}" "${TMP_DIR}/seed_nodes.err" "node"
# Enumerate cluster node names from the node-resource payload. python3 is
# used for the JSON parsing since it ships on Proxmox hosts.
mapfile -t NODE_ROWS < <(
  python3 - "${NODES_JSON}" <<'PY'
import json, sys

with open(sys.argv[1], 'r', encoding='utf-8') as fh:
    data = json.load(fh)
for row in data:
    node = row.get("node")
    if node:
        print(node)
PY
)

# An empty node list means the cluster query silently returned junk.
if [[ "${#NODE_ROWS[@]}" -eq 0 ]]; then
  echo "ERROR: no Proxmox nodes returned by cluster resources" >&2
  exit 3
fi
# Batched remote probe: a single SSH round-trip per node runs every
# collector command. "__NAME__" marker lines delimit each command's output
# so the Python stage can split it back into sections.
REMOTE_BODY=$(cat <<'EOS'
set -euo pipefail
echo "__HOSTNAME__"
hostname -s 2>/dev/null || hostname
echo "__UPTIME__"
uptime
echo "__NPROC__"
nproc 2>/dev/null || getconf _NPROCESSORS_ONLN || echo 0
echo "__FREE__"
free -b
echo "__PSI_CPU__"
cat /proc/pressure/cpu 2>/dev/null || true
echo "__PSI_IO__"
cat /proc/pressure/io 2>/dev/null || true
echo "__PSI_MEMORY__"
cat /proc/pressure/memory 2>/dev/null || true
echo "__DF_VZ__"
df -B1 -P /var/lib/vz 2>/dev/null || true
echo "__PVESM__"
pvesm status 2>/dev/null || true
EOS
)

# Collect from every node. A failing node is recorded in-band with a
# COLLECTION_FAILED sentinel (plus the ssh stderr) rather than aborting,
# so one dead node cannot hide the health of the rest.
for node in "${NODE_ROWS[@]}"; do
  target="${PROXMOX_SSH_USER}@$(node_ssh_host "${node}")"
  out_file="${NODE_DIR}/${node}.txt"
  err_file="${NODE_DIR}/${node}.err"
  if ! "${ssh_base[@]}" "${target}" "bash -lc $(printf '%q' "${REMOTE_BODY}")" \
      >"${out_file}" 2>"${err_file}"; then
    printf 'COLLECTION_FAILED\n' >"${out_file}"
    cat "${err_file}" >>"${out_file}" || true
  fi
done
set +e
# Analysis stage: all parsing and threshold logic lives in this embedded
# Python program. Run outside `set -e` so its health exit code (0/1/2) can
# be captured after the heredoc instead of killing the script.
python3 - "${VM_JSON}" "${NODES_JSON}" "${NODE_DIR}" "${JSON_OUT}" "${TEXT_OUT}" "${SEED_HOST}" <<'PY'
import json
import math
import os
import re
import sys
from collections import Counter, defaultdict
from datetime import datetime, timezone

# NOTE(review): math and defaultdict are imported but appear unused below.
# argv[1:7]: VM inventory JSON, node inventory JSON, per-node capture dir,
# JSON report path, text report path, seed host (recorded as metadata only).
vm_json, nodes_json, node_dir, json_out, text_out, seed_host = sys.argv[1:7]
def env_float(name, default):
    """Read env var ``name`` as a float; fall back to ``default`` on any error."""
    raw = os.environ.get(name, default)
    try:
        return float(raw)
    except Exception:
        return float(default)

def env_int(name, default):
    """Read env var ``name`` as an int (float strings truncate toward zero)."""
    raw = os.environ.get(name, default)
    try:
        return int(float(raw))
    except Exception:
        return int(default)
# Alert thresholds; every entry can be overridden via the CLUSTER_HEALTH_*
# environment variables listed in the file header. Percentages are 0-100,
# load thresholds are per-core, PSI values compare against avg10 figures.
T = {
    "node_load_warn_per_cpu": env_float("CLUSTER_HEALTH_NODE_LOAD_WARN_PER_CPU", 0.90),
    "node_load_crit_per_cpu": env_float("CLUSTER_HEALTH_NODE_LOAD_CRIT_PER_CPU", 1.20),
    "node_mem_warn_pct": env_float("CLUSTER_HEALTH_NODE_MEM_WARN_PCT", 85),
    "node_mem_crit_pct": env_float("CLUSTER_HEALTH_NODE_MEM_CRIT_PCT", 92),
    "vz_warn_pct": env_float("CLUSTER_HEALTH_VZ_WARN_PCT", 85),
    "vz_crit_pct": env_float("CLUSTER_HEALTH_VZ_CRIT_PCT", 93),
    "psi_cpu_some_warn": env_float("CLUSTER_HEALTH_PSI_CPU_SOME_WARN", 10),
    "psi_cpu_some_crit": env_float("CLUSTER_HEALTH_PSI_CPU_SOME_CRIT", 20),
    "psi_io_full_warn": env_float("CLUSTER_HEALTH_PSI_IO_FULL_WARN", 10),
    "psi_io_full_crit": env_float("CLUSTER_HEALTH_PSI_IO_FULL_CRIT", 20),
    "psi_mem_full_warn": env_float("CLUSTER_HEALTH_PSI_MEM_FULL_WARN", 5),
    "psi_mem_full_crit": env_float("CLUSTER_HEALTH_PSI_MEM_FULL_CRIT", 10),
    "lxc_mem_warn_pct": env_float("CLUSTER_HEALTH_LXC_MEM_WARN_PCT", 85),
    "lxc_mem_crit_pct": env_float("CLUSTER_HEALTH_LXC_MEM_CRIT_PCT", 95),
    "lxc_cpu_warn_pct": env_float("CLUSTER_HEALTH_LXC_CPU_WARN_PCT", 20),
    "lxc_cpu_crit_pct": env_float("CLUSTER_HEALTH_LXC_CPU_CRIT_PCT", 40),
    "node_skew_warn_pct": env_float("CLUSTER_HEALTH_NODE_SKEW_WARN_PCT", 45),
    "node_skew_crit_pct": env_float("CLUSTER_HEALTH_NODE_SKEW_CRIT_PCT", 55),
    "summary_top_n": env_int("CLUSTER_HEALTH_SUMMARY_TOP_N", 8),
}

# Inputs produced by the shell wrapper: pvesh /cluster/resources dumps.
with open(vm_json, "r", encoding="utf-8") as fh:
    vm_rows = json.load(fh)
with open(nodes_json, "r", encoding="utf-8") as fh:
    node_rows = json.load(fh)
def parse_uptime_load(text):
    """Extract the three load averages from `uptime` output.

    Handles both the Linux "load average:" and BSD "load averages:" labels.
    Returns [1m, 5m, 15m] floats, or None when no load figures are found.
    """
    match = re.search(
        r"load average[s]?:\s*([0-9.]+),\s*([0-9.]+),\s*([0-9.]+)", text
    )
    if match is None:
        return None
    return [float(part) for part in match.groups()]
def parse_free(text):
    """Pull total/used byte counts from the "Mem:" row of `free -b` output.

    Returns {"total": int, "used": int}, or None when no usable Mem: row
    exists (e.g. the section failed to capture).
    """
    mem_rows = (row for row in text.splitlines() if row.startswith("Mem:"))
    for row in mem_rows:
        fields = row.split()
        if len(fields) >= 3:
            return {"total": int(fields[1]), "used": int(fields[2])}
    return None
def parse_psi(section):
    """Parse /proc/pressure/* text into {kind: {metric: float}}.

    Expected line shape: "some avg10=0.00 avg60=0.00 avg300=0.00 total=0".
    Malformed lines (no payload after the kind) and malformed tokens
    (missing "=" or non-numeric values) are skipped instead of raising, so
    a truncated or garbled capture cannot abort the whole report — the
    previous version raised ValueError on a single-token line or a token
    without "=".
    """
    out = {}
    for line in section.splitlines():
        line = line.strip()
        if not line:
            continue
        parts = line.split(None, 1)
        if len(parts) != 2:
            # No key=value payload (e.g. truncated capture) — skip the line.
            continue
        kind, rest = parts
        vals = {}
        for token in rest.split():
            if "=" not in token:
                continue
            key, value = token.split("=", 1)
            try:
                vals[key] = float(value)
            except ValueError:
                pass
        out[kind] = vals
    return out
def parse_df_vz(section):
    """Parse the last data row of `df -B1 -P /var/lib/vz` output.

    Returns a dict of filesystem/size/used/avail/use_pct/mountpoint, or
    None when the section is empty or has no parseable data row.
    """
    rows = [row for row in section.splitlines() if row.strip()]
    if len(rows) < 2:
        return None
    fields = rows[-1].split()
    if len(fields) < 6:
        return None
    filesystem, size_b, used_b, avail_b, pct, mountpoint = fields[:6]
    return {
        "filesystem": filesystem,
        "size_bytes": int(size_b),
        "used_bytes": int(used_b),
        "avail_bytes": int(avail_b),
        "use_pct": float(pct.rstrip("%")),
        "mountpoint": mountpoint,
    }
def parse_pvesm(section):
    """Parse `pvesm status` table rows (header skipped) into a list of dicts.

    A byte column of exactly "0" is reported as None (pvesm prints 0 for
    storages with no exposed capacity); use_pct is derived only when both
    total and used are known. Rows with fewer than 6 columns are ignored.
    """
    rows = [row.rstrip() for row in section.splitlines() if row.strip()]
    if len(rows) < 2:
        return []

    def byte_col(raw):
        # "0" means "unknown/not exposed" in pvesm output, not truly empty.
        return None if raw == "0" else int(raw)

    storage = []
    for row in rows[1:]:
        fields = row.split()
        if len(fields) < 6:
            continue
        entry = {
            "name": fields[0],
            "type": fields[1],
            "status": fields[2],
            "total_bytes": byte_col(fields[3]),
            "used_bytes": byte_col(fields[4]),
            "available_bytes": byte_col(fields[5]),
            "use_pct": None,
        }
        if entry["total_bytes"] and entry["used_bytes"] is not None:
            entry["use_pct"] = round((entry["used_bytes"] / entry["total_bytes"]) * 100, 2)
        storage.append(entry)
    return storage
def split_sections(text):
    """Split marker-delimited capture text into {SECTION_NAME: body}.

    Markers are lines shaped like "__NAME__" (as emitted by REMOTE_BODY).
    Text before the first marker is discarded; each body is stripped of
    leading/trailing whitespace.
    """
    sections = {}
    name = None
    body = []

    def flush():
        if name is not None:
            sections[name] = "\n".join(body).strip()

    for raw in text.splitlines():
        line = raw.rstrip("\n")
        if line.startswith("__") and line.endswith("__"):
            flush()
            name = line.strip("_")
            body = []
        else:
            body.append(line)
    flush()
    return sections
# Ordering weight used to sort findings most-severe-first.
severity_rank = {"ok": 0, "warn": 1, "crit": 2}

# Flat list of findings accumulated while scanning nodes and LXCs.
alerts = []

def add_alert(severity, scope, entity, metric, message, value=None, threshold=None):
    """Record one finding. ``value``/``threshold`` are optional numeric context."""
    alerts.append(
        {
            "severity": severity,
            "scope": scope,
            "entity": entity,
            "metric": metric,
            "message": message,
            "value": value,
            "threshold": threshold,
        }
    )
# ---- Per-node metrics and node-level alerts -------------------------------
# Each <node>.txt in node_dir is either a marker-delimited capture or the
# COLLECTION_FAILED sentinel written by the shell collector on SSH failure.
node_metrics = {}
for path in sorted(os.listdir(node_dir)):
    if not path.endswith(".txt"):
        continue
    node_name = path[:-4]
    full_path = os.path.join(node_dir, path)
    text = open(full_path, "r", encoding="utf-8", errors="replace").read()
    if text.startswith("COLLECTION_FAILED"):
        # Keep the node in the report with its captured stderr, and raise a
        # CRIT finding: an unprobeable node is itself a health problem.
        node_metrics[node_name] = {
            "node": node_name,
            "collection_failed": True,
            "error": text.splitlines()[1:] if len(text.splitlines()) > 1 else [],
        }
        add_alert("crit", "node", node_name, "collection", f"{node_name} metrics collection failed")
        continue
    sections = split_sections(text)
    loads = parse_uptime_load(sections.get("UPTIME", ""))
    nproc = None
    try:
        # First line of the NPROC section; "0" fallback mirrors the remote probe.
        nproc = int((sections.get("NPROC", "0").splitlines() or ["0"])[0].strip())
    except ValueError:
        nproc = 0
    free_mem = parse_free(sections.get("FREE", ""))
    psi = {
        "cpu": parse_psi(sections.get("PSI_CPU", "")),
        "io": parse_psi(sections.get("PSI_IO", "")),
        "memory": parse_psi(sections.get("PSI_MEMORY", "")),
    }
    df_vz = parse_df_vz(sections.get("DF_VZ", ""))
    pvesm = parse_pvesm(sections.get("PVESM", ""))
    metric = {
        "node": node_name,
        "hostname": (sections.get("HOSTNAME", node_name).splitlines() or [node_name])[0].strip(),
        "collection_failed": False,
        "loadavg": loads,
        "nproc": nproc,
        # 1-minute load normalised per core; None when either piece is missing.
        "load_per_cpu_1m": round(loads[0] / nproc, 3) if loads and nproc else None,
        "memory": None,
        "psi": psi,
        "df_vz": df_vz,
        "storage": pvesm,
    }
    if free_mem and free_mem["total"] > 0:
        metric["memory"] = {
            **free_mem,
            "used_pct": round((free_mem["used"] / free_mem["total"]) * 100, 2),
        }
    node_metrics[node_name] = metric
    # Threshold checks below: each fires only when its metric was computable.
    lp = metric["load_per_cpu_1m"]
    if lp is not None:
        if lp >= T["node_load_crit_per_cpu"]:
            add_alert("crit", "node", node_name, "load_per_cpu_1m", f"{node_name} load/core is high", lp, T["node_load_crit_per_cpu"])
        elif lp >= T["node_load_warn_per_cpu"]:
            add_alert("warn", "node", node_name, "load_per_cpu_1m", f"{node_name} load/core is elevated", lp, T["node_load_warn_per_cpu"])
    mem = metric["memory"]
    if mem:
        if mem["used_pct"] >= T["node_mem_crit_pct"]:
            add_alert("crit", "node", node_name, "memory_used_pct", f"{node_name} host RAM usage is high", mem["used_pct"], T["node_mem_crit_pct"])
        elif mem["used_pct"] >= T["node_mem_warn_pct"]:
            add_alert("warn", "node", node_name, "memory_used_pct", f"{node_name} host RAM usage is elevated", mem["used_pct"], T["node_mem_warn_pct"])
    if df_vz:
        if df_vz["use_pct"] >= T["vz_crit_pct"]:
            add_alert("crit", "node", node_name, "vz_use_pct", f"{node_name} /var/lib/vz is near full", df_vz["use_pct"], T["vz_crit_pct"])
        elif df_vz["use_pct"] >= T["vz_warn_pct"]:
            add_alert("warn", "node", node_name, "vz_use_pct", f"{node_name} /var/lib/vz usage is elevated", df_vz["use_pct"], T["vz_warn_pct"])
    # PSI avg10 figures (10-second window) from /proc/pressure/*.
    cpu_some = psi["cpu"].get("some", {}).get("avg10")
    if cpu_some is not None:
        if cpu_some >= T["psi_cpu_some_crit"]:
            add_alert("crit", "node", node_name, "psi_cpu_some_avg10", f"{node_name} CPU pressure is high", cpu_some, T["psi_cpu_some_crit"])
        elif cpu_some >= T["psi_cpu_some_warn"]:
            add_alert("warn", "node", node_name, "psi_cpu_some_avg10", f"{node_name} CPU pressure is elevated", cpu_some, T["psi_cpu_some_warn"])
    io_full = psi["io"].get("full", {}).get("avg10")
    if io_full is not None:
        if io_full >= T["psi_io_full_crit"]:
            add_alert("crit", "node", node_name, "psi_io_full_avg10", f"{node_name} I/O full pressure is high", io_full, T["psi_io_full_crit"])
        elif io_full >= T["psi_io_full_warn"]:
            add_alert("warn", "node", node_name, "psi_io_full_avg10", f"{node_name} I/O full pressure is elevated", io_full, T["psi_io_full_warn"])
    mem_full = psi["memory"].get("full", {}).get("avg10")
    if mem_full is not None:
        if mem_full >= T["psi_mem_full_crit"]:
            add_alert("crit", "node", node_name, "psi_memory_full_avg10", f"{node_name} memory full pressure is high", mem_full, T["psi_mem_full_crit"])
        elif mem_full >= T["psi_mem_full_warn"]:
            add_alert("warn", "node", node_name, "psi_memory_full_avg10", f"{node_name} memory full pressure is elevated", mem_full, T["psi_mem_full_warn"])
# ---- Cluster-wide LXC inventory and per-container alerts ------------------
# Only containers are considered; QEMU VM rows are filtered out here.
lxc_rows = [row for row in vm_rows if row.get("type") == "lxc"]
running_lxcs = [row for row in lxc_rows if row.get("status") == "running"]
stopped_lxcs = [row for row in lxc_rows if row.get("status") != "running"]
node_counts = Counter(row.get("node", "unknown") for row in running_lxcs)
running_total = len(running_lxcs)
# Flag nodes that host a disproportionate share of the running containers.
for node, count in node_counts.items():
    pct = round((count / running_total) * 100, 2) if running_total else 0.0
    if pct >= T["node_skew_crit_pct"]:
        add_alert("crit", "cluster", node, "running_lxc_share_pct", f"{node} holds a large share of running LXCs", pct, T["node_skew_crit_pct"])
    elif pct >= T["node_skew_warn_pct"]:
        add_alert("warn", "cluster", node, "running_lxc_share_pct", f"{node} holds a high share of running LXCs", pct, T["node_skew_warn_pct"])
mem_hot = []
cpu_hot = []
disk_rw = []
network_totals = []
all_lxcs = []
for row in running_lxcs:
    maxmem = row.get("maxmem") or 0
    mem = row.get("mem") or 0
    mem_pct = round((mem / maxmem) * 100, 2) if maxmem else None
    maxcpu = row.get("maxcpu") or 0
    # NOTE(review): `cpu` from /cluster/resources appears to be a 0..1 usage
    # fraction scaled to percent here — confirm against the pvesh schema.
    cpu_pct = round(float(row.get("cpu") or 0) * 100, 2)
    diskread = int(row.get("diskread") or 0)
    diskwrite = int(row.get("diskwrite") or 0)
    netin = int(row.get("netin") or 0)
    netout = int(row.get("netout") or 0)
    entry = {
        "vmid": row.get("vmid"),
        "name": row.get("name"),
        "node": row.get("node"),
        "cpu_pct": cpu_pct,
        "maxcpu": maxcpu,
        "mem_pct": mem_pct,
        "mem_bytes": mem,
        "maxmem_bytes": maxmem,
        "disk_pct": round(((row.get("disk") or 0) / row.get("maxdisk")) * 100, 2) if row.get("maxdisk") else None,
        "disk_bytes": int(row.get("disk") or 0),
        "maxdisk_bytes": int(row.get("maxdisk") or 0),
        "diskread_bytes": diskread,
        "diskwrite_bytes": diskwrite,
        "netin_bytes": netin,
        "netout_bytes": netout,
        "status": row.get("status"),
    }
    all_lxcs.append(entry)
    # Memory alerts only when a memory ceiling is known (mem_pct computable).
    if mem_pct is not None:
        mem_hot.append(entry)
        if mem_pct >= T["lxc_mem_crit_pct"]:
            add_alert("crit", "lxc", f"{row.get('vmid')}:{row.get('name')}", "memory_used_pct", "LXC memory usage is high", mem_pct, T["lxc_mem_crit_pct"])
        elif mem_pct >= T["lxc_mem_warn_pct"]:
            add_alert("warn", "lxc", f"{row.get('vmid')}:{row.get('name')}", "memory_used_pct", "LXC memory usage is elevated", mem_pct, T["lxc_mem_warn_pct"])
    if cpu_pct >= T["lxc_cpu_crit_pct"]:
        add_alert("crit", "lxc", f"{row.get('vmid')}:{row.get('name')}", "cpu_pct", "LXC CPU usage is high", cpu_pct, T["lxc_cpu_crit_pct"])
    elif cpu_pct >= T["lxc_cpu_warn_pct"]:
        add_alert("warn", "lxc", f"{row.get('vmid')}:{row.get('name')}", "cpu_pct", "LXC CPU usage is elevated", cpu_pct, T["lxc_cpu_warn_pct"])
    cpu_hot.append(entry)
    disk_rw.append({**entry, "disk_total_bytes": diskread + diskwrite})
    network_totals.append({**entry, "network_total_bytes": netin + netout})
# Rank the "hot" lists descending; vmid tiebreak keeps ordering deterministic.
mem_hot.sort(key=lambda x: (-1 if x["mem_pct"] is None else -x["mem_pct"], x["vmid"]))
cpu_hot.sort(key=lambda x: (-x["cpu_pct"], x["vmid"]))
disk_rw.sort(key=lambda x: (-x["disk_total_bytes"], x["vmid"]))
network_totals.sort(key=lambda x: (-x["network_total_bytes"], x["vmid"]))
alerts.sort(key=lambda a: (-severity_rank[a["severity"]], a["scope"], str(a["entity"]), a["metric"]))
# Overall status is the worst severity present (drives the exit code).
overall = "ok"
if any(a["severity"] == "crit" for a in alerts):
    overall = "crit"
elif any(a["severity"] == "warn" for a in alerts):
    overall = "warn"
# Per-node summary merging the cluster-resource rows with collected metrics.
# Sub-structures of a successfully-collected node may legitimately be None
# when one section failed to parse (metric["memory"], metric["df_vz"], and
# metric["loadavg"] are all initialised to / may remain None above). The
# previous version did e.g. metric.get("memory", {}).get(...) — but .get()
# returns the stored None when the key EXISTS with value None, so a single
# unparseable `free`/`df`/`uptime` section crashed the whole report.
# Coalesce with `or` before any attribute access/indexing.
node_summary = []
for row in node_rows:
    node_name = row.get("node")
    metric = node_metrics.get(node_name, {"collection_failed": True})
    failed = bool(metric.get("collection_failed"))
    loadavg = (metric.get("loadavg") or []) if not failed else []
    mem_info = (metric.get("memory") or {}) if not failed else {}
    psi_info = (metric.get("psi") or {}) if not failed else {}
    vz_info = (metric.get("df_vz") or {}) if not failed else {}
    node_summary.append({
        "node": node_name,
        "status": row.get("status"),
        "running_lxcs": node_counts.get(node_name, 0),
        "cluster_cpu_fraction_pct": round(float(row.get("cpu") or 0) * 100, 2) if row.get("cpu") is not None else None,
        "cluster_mem_fraction_pct": round(((row.get("mem") or 0) / row.get("maxmem")) * 100, 2) if row.get("maxmem") else None,
        "loadavg_1m": loadavg[0] if loadavg else None,
        "load_per_cpu_1m": metric.get("load_per_cpu_1m"),
        "host_mem_used_pct": mem_info.get("used_pct"),
        "psi_cpu_some_avg10": psi_info.get("cpu", {}).get("some", {}).get("avg10"),
        "psi_io_full_avg10": psi_info.get("io", {}).get("full", {}).get("avg10"),
        "psi_memory_full_avg10": psi_info.get("memory", {}).get("full", {}).get("avg10"),
        "vz_use_pct": vz_info.get("use_pct"),
    })
# Assemble the full machine-readable report written to JSON_OUT.
report = {
    "collected_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
    "seed_host": seed_host,
    "overall_status": overall,
    "thresholds": T,
    "cluster": {
        "total_lxcs": len(lxc_rows),
        "running_lxcs": running_total,
        "stopped_lxcs": len(stopped_lxcs),
        "running_distribution": [
            {
                "node": node,
                "running_lxcs": count,
                "share_pct": round((count / running_total) * 100, 2) if running_total else 0.0,
            }
            for node, count in sorted(node_counts.items())
        ],
    },
    "nodes": node_summary,
    "node_metrics": node_metrics,
    "top_lxcs": {
        "memory_pct": mem_hot[:T["summary_top_n"]],
        "cpu_pct": cpu_hot[:T["summary_top_n"]],
        "disk_total_bytes": disk_rw[:T["summary_top_n"]],
        "network_total_bytes": network_totals[:T["summary_top_n"]],
    },
    # NOTE(review): assumes every LXC row carries a numeric vmid —
    # int(None) would raise here; confirm pvesh always populates vmid.
    "lxcs": sorted(all_lxcs, key=lambda x: (str(x["node"]), int(x["vmid"]))),
    "alerts": alerts,
}
with open(json_out, "w", encoding="utf-8") as fh:
    json.dump(report, fh, indent=2)
    fh.write("\n")
def gib(n):
    """Convert a byte count to GiB, rounded to two decimal places."""
    return round(n / 1073741824, 2)  # 1073741824 == 1024 ** 3
# ---- Human-readable summary (mirrors the JSON report) ---------------------
lines = []
lines.append(f"LXC cluster health {report['collected_at']} ({overall.upper()})")
lines.append(f"Seed host: {report['seed_host']}")
lines.append(f"LXCs: running {running_total} / total {len(lxc_rows)} / stopped {len(stopped_lxcs)}")
lines.append("")
lines.append("Node summary:")
for item in node_summary:
    # Metrics that could not be collected/parsed render as 'n/a'.
    lines.append(
        f"- {item['node']}: running_lxcs={item['running_lxcs']}, "
        f"load1={item['loadavg_1m'] if item['loadavg_1m'] is not None else 'n/a'}, "
        f"load/core={item['load_per_cpu_1m'] if item['load_per_cpu_1m'] is not None else 'n/a'}, "
        f"host_mem={item['host_mem_used_pct'] if item['host_mem_used_pct'] is not None else 'n/a'}%, "
        f"psi_io_full_avg10={item['psi_io_full_avg10'] if item['psi_io_full_avg10'] is not None else 'n/a'}, "
        f"vz={item['vz_use_pct'] if item['vz_use_pct'] is not None else 'n/a'}%"
    )
lines.append("")
lines.append("Top findings:")
if alerts:
    # Show at least 10 findings even if the configured top-N is smaller.
    for alert in alerts[: max(T["summary_top_n"], 10)]:
        value = "" if alert["value"] is None else f" value={alert['value']}"
        threshold = "" if alert["threshold"] is None else f" threshold={alert['threshold']}"
        lines.append(f"- [{alert['severity'].upper()}] {alert['scope']} {alert['entity']} {alert['metric']}: {alert['message']}{value}{threshold}")
else:
    lines.append("- none")
lines.append("")
lines.append("Top LXC memory:")
for item in mem_hot[: T["summary_top_n"]]:
    lines.append(f"- {item['vmid']} {item['name']} @ {item['node']}: mem={item['mem_pct']}% of {gib(item['maxmem_bytes'])} GiB, cpu={item['cpu_pct']}%")
lines.append("")
lines.append("Top LXC CPU:")
for item in cpu_hot[: T["summary_top_n"]]:
    lines.append(f"- {item['vmid']} {item['name']} @ {item['node']}: cpu={item['cpu_pct']}%, mem={item['mem_pct']}%")
lines.append("")
lines.append("Top cumulative disk:")
for item in disk_rw[: T["summary_top_n"]]:
    lines.append(f"- {item['vmid']} {item['name']} @ {item['node']}: read+write={gib(item['disk_total_bytes'])} GiB")
lines.append("")
lines.append("Top cumulative network:")
for item in network_totals[: T["summary_top_n"]]:
    lines.append(f"- {item['vmid']} {item['name']} @ {item['node']}: in+out={gib(item['network_total_bytes'])} GiB")
with open(text_out, "w", encoding="utf-8") as fh:
    fh.write("\n".join(lines).rstrip() + "\n")
# Machine-readable pointer block on stdout for the calling shell wrapper.
print(json.dumps({
    "overall_status": overall,
    "json_report": json_out,
    "text_report": text_out,
    "alerts": len(alerts),
}))
# Exit code mirrors the script's documented contract: 2=CRIT, 1=WARN, 0=OK.
if overall == "crit":
    sys.exit(2)
if overall == "warn":
    sys.exit(1)
sys.exit(0)
PY
# Capture the analyzer's health exit code before re-enabling errexit.
RC=$?
set -e

# Show the requested report flavour, then propagate the health code
# (0 OK / 1 WARN / 2 CRIT) so CI and cron callers can branch on it.
if [[ "${JSON_ONLY}" -ne 1 ]]; then
  cat "${TEXT_OUT}"
  echo
  echo "JSON: ${JSON_OUT}"
  echo "Text: ${TEXT_OUT}"
else
  cat "${JSON_OUT}"
fi
exit "${RC}"