#!/usr/bin/env bash
# Poll Proxmox LXC cluster health over SSH with key auth.
# Collects:
# - /cluster/resources VM inventory (LXCs only) from a seed Proxmox host
# - Per-node load, RAM, PSI, /var/lib/vz usage, and pvesm status
# Emits:
# - Timestamped JSON report under reports/status/
# - Human-readable summary text next to the JSON
# Exit codes:
#   0 = OK
#   1 = WARN findings present
#   2 = CRIT findings present
#   3 = Collection failure / seed unreachable
#
# Usage:
#   bash scripts/verify/poll-lxc-cluster-health.sh
#   SEED_HOST=192.168.11.11 bash scripts/verify/poll-lxc-cluster-health.sh --json
#   OUT_DIR=/tmp bash scripts/verify/poll-lxc-cluster-health.sh
#
# Threshold env overrides:
#   CLUSTER_HEALTH_NODE_LOAD_WARN_PER_CPU=0.90
#   CLUSTER_HEALTH_NODE_LOAD_CRIT_PER_CPU=1.20
#   CLUSTER_HEALTH_NODE_MEM_WARN_PCT=85
#   CLUSTER_HEALTH_NODE_MEM_CRIT_PCT=92
#   CLUSTER_HEALTH_VZ_WARN_PCT=85
#   CLUSTER_HEALTH_VZ_CRIT_PCT=93
#   CLUSTER_HEALTH_PSI_CPU_SOME_WARN=10
#   CLUSTER_HEALTH_PSI_CPU_SOME_CRIT=20
#   CLUSTER_HEALTH_PSI_IO_FULL_WARN=10
#   CLUSTER_HEALTH_PSI_IO_FULL_CRIT=20
#   CLUSTER_HEALTH_PSI_MEM_FULL_WARN=5
#   CLUSTER_HEALTH_PSI_MEM_FULL_CRIT=10
#   CLUSTER_HEALTH_LXC_MEM_WARN_PCT=85
#   CLUSTER_HEALTH_LXC_MEM_CRIT_PCT=95
#   CLUSTER_HEALTH_LXC_CPU_WARN_PCT=20
#   CLUSTER_HEALTH_LXC_CPU_CRIT_PCT=40
#   CLUSTER_HEALTH_NODE_SKEW_WARN_PCT=45
#   CLUSTER_HEALTH_NODE_SKEW_CRIT_PCT=55
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
# shellcheck source=/dev/null
source "${PROJECT_ROOT}/scripts/lib/load-project-env.sh"

# --json prints the JSON report instead of the text summary.
JSON_ONLY=0
case "${1:-}" in
  --json) JSON_ONLY=1 ;;
  "" ) ;;
  -h|--help)
    # Usage/threshold docs live in the header comment block above.
    sed -n '1,48p' "$0"
    exit 0
    ;;
  *)
    echo "ERROR: unknown argument: ${1}" >&2
    exit 2
    ;;
esac

# Connection and output locations (all overridable via env).
PROXMOX_SSH_USER="${PROXMOX_SSH_USER:-root}"
SEED_HOST="${SEED_HOST:-${PROXMOX_HOST_R630_01:-192.168.11.11}}"
OUT_DIR="${OUT_DIR:-${PROJECT_ROOT}/reports/status}"
TS="$(date +%Y%m%d_%H%M%S)"
JSON_OUT="${JSON_OUT:-${OUT_DIR}/lxc_cluster_health_${TS}.json}"
TEXT_OUT="${TEXT_OUT:-${OUT_DIR}/lxc_cluster_health_${TS}.txt}"
mkdir -p "${OUT_DIR}"

# Scratch space for raw collection output; removed on any exit path.
TMP_DIR="$(mktemp -d)"
trap 'rm -rf "${TMP_DIR}"' EXIT

VM_JSON="${TMP_DIR}/cluster_resources_vm.json"
NODES_JSON="${TMP_DIR}/cluster_resources_node.json"
NODE_DIR="${TMP_DIR}/nodes"
mkdir -p "${NODE_DIR}"

# Non-interactive SSH: key auth only, fail fast on unreachable hosts.
ssh_base=(
  ssh
  -o BatchMode=yes
  -o ConnectTimeout=15
  -o StrictHostKeyChecking=no
)

SEED_TARGET="${PROXMOX_SSH_USER}@${SEED_HOST}"
# Map a Proxmox cluster node name to its SSH address.
# Known nodes resolve through PROXMOX_HOST_* env overrides; anything else
# falls back to the node name itself (assumed DNS-resolvable).
# Arguments: $1 - node name as reported by /cluster/resources
# Outputs:   SSH host/IP on stdout
node_ssh_host() {
  case "$1" in
    ml110)   printf '%s\n' "${PROXMOX_HOST_ML110:-$1}" ;;
    r630-01) printf '%s\n' "${PROXMOX_HOST_R630_01:-$1}" ;;
    r630-02) printf '%s\n' "${PROXMOX_HOST_R630_02:-$1}" ;;
    r630-03) printf '%s\n' "${PROXMOX_HOST_R630_03:-$1}" ;;
    r630-04) printf '%s\n' "${PROXMOX_HOST_R630_04:-$1}" ;;
    *)       printf '%s\n' "$1" ;;
  esac
}
# Fail fast (exit 3) when the seed host is unreachable or refuses the
# cluster-resource queries; everything downstream depends on this data.
# NOTE(review): ping -W is seconds on Linux iputils but milliseconds on
# BSD/macOS — confirm target platforms if portability matters.
if ! ping -c1 -W2 "${SEED_HOST}" >/dev/null 2>&1; then
  echo "ERROR: seed unreachable: ${SEED_HOST}" >&2
  exit 3
fi

# Full VM/LXC inventory for the whole cluster, as JSON.
if ! "${ssh_base[@]}" "${SEED_TARGET}" "pvesh get /cluster/resources --type vm --output-format json" >"${VM_JSON}" 2>"${TMP_DIR}/seed_vm.err"; then
  echo "ERROR: failed to query VM resources from ${SEED_HOST}" >&2
  cat "${TMP_DIR}/seed_vm.err" >&2 || true
  exit 3
fi

# Node-level rollups (status, cpu, mem) for every cluster member.
if ! "${ssh_base[@]}" "${SEED_TARGET}" "pvesh get /cluster/resources --type node --output-format json" >"${NODES_JSON}" 2>"${TMP_DIR}/seed_nodes.err"; then
  echo "ERROR: failed to query node resources from ${SEED_HOST}" >&2
  cat "${TMP_DIR}/seed_nodes.err" >&2 || true
  exit 3
fi
# Extract the node names from the cluster resource dump, preserving the
# order the API returned them in.
mapfile -t NODE_ROWS < <(
  python3 - "${NODES_JSON}" <<'PY'
import json, sys
with open(sys.argv[1], 'r', encoding='utf-8') as fh:
    data = json.load(fh)
for row in data:
    node = row.get("node")
    if node:
        print(node)
PY
)

# An empty list means the seed answered but returned nothing usable.
if [[ "${#NODE_ROWS[@]}" -eq 0 ]]; then
  echo "ERROR: no Proxmox nodes returned by cluster resources" >&2
  exit 3
fi
# Script executed on each node over SSH. Sections are delimited by
# __MARKER__ lines so the Python report stage can split them back apart.
# Optional probes (PSI, df, pvesm) tolerate failure via `|| true` so one
# missing feature doesn't sink the whole node's collection.
REMOTE_BODY=$(cat <<'EOS'
set -euo pipefail
echo "__HOSTNAME__"
hostname -s 2>/dev/null || hostname
echo "__UPTIME__"
uptime
echo "__NPROC__"
nproc 2>/dev/null || getconf _NPROCESSORS_ONLN || echo 0
echo "__FREE__"
free -b
echo "__PSI_CPU__"
cat /proc/pressure/cpu 2>/dev/null || true
echo "__PSI_IO__"
cat /proc/pressure/io 2>/dev/null || true
echo "__PSI_MEMORY__"
cat /proc/pressure/memory 2>/dev/null || true
echo "__DF_VZ__"
df -B1 -P /var/lib/vz 2>/dev/null || true
echo "__PVESM__"
pvesm status 2>/dev/null || true
EOS
)
# Collect raw metrics from every node. A failed node is recorded with a
# COLLECTION_FAILED sentinel (followed by its stderr) instead of aborting
# the run; the report stage turns the sentinel into a CRIT alert.
for node in "${NODE_ROWS[@]}"; do
  target="${PROXMOX_SSH_USER}@$(node_ssh_host "${node}")"
  # printf %q shell-quotes the remote body safely for bash -lc.
  if ! "${ssh_base[@]}" "${target}" "bash -lc $(printf '%q' "${REMOTE_BODY}")" >"${NODE_DIR}/${node}.txt" 2>"${NODE_DIR}/${node}.err"; then
    printf 'COLLECTION_FAILED\n' >"${NODE_DIR}/${node}.txt"
    cat "${NODE_DIR}/${node}.err" >>"${NODE_DIR}/${node}.txt" || true
  fi
done
# Analysis/report stage. Run under `set +e` so the Python exit code
# (0/1/2 = ok/warn/crit) can be captured afterwards and propagated as this
# script's own exit status.
set +e
python3 - "${VM_JSON}" "${NODES_JSON}" "${NODE_DIR}" "${JSON_OUT}" "${TEXT_OUT}" "${SEED_HOST}" <<'PY'
import json
import math
import os
import re
import sys
from collections import Counter, defaultdict
from datetime import datetime, timezone

# Positional args mirror the shell invocation above.
vm_json, nodes_json, node_dir, json_out, text_out, seed_host = sys.argv[1:7]
def env_float(name, default):
    """Read env var `name` as a float; fall back to `default` when the
    variable is missing or unparsable."""
    try:
        return float(os.environ.get(name, default))
    except Exception:
        return float(default)


def env_int(name, default):
    """Read env var `name` as an int (parsed via float so "5.0" works);
    fall back to `default` when missing or unparsable."""
    try:
        return int(float(os.environ.get(name, default)))
    except Exception:
        return int(default)
# Alert thresholds; every key can be overridden via the CLUSTER_HEALTH_*
# env vars listed in the script header.
T = {
    "node_load_warn_per_cpu": env_float("CLUSTER_HEALTH_NODE_LOAD_WARN_PER_CPU", 0.90),
    "node_load_crit_per_cpu": env_float("CLUSTER_HEALTH_NODE_LOAD_CRIT_PER_CPU", 1.20),
    "node_mem_warn_pct": env_float("CLUSTER_HEALTH_NODE_MEM_WARN_PCT", 85),
    "node_mem_crit_pct": env_float("CLUSTER_HEALTH_NODE_MEM_CRIT_PCT", 92),
    "vz_warn_pct": env_float("CLUSTER_HEALTH_VZ_WARN_PCT", 85),
    "vz_crit_pct": env_float("CLUSTER_HEALTH_VZ_CRIT_PCT", 93),
    "psi_cpu_some_warn": env_float("CLUSTER_HEALTH_PSI_CPU_SOME_WARN", 10),
    "psi_cpu_some_crit": env_float("CLUSTER_HEALTH_PSI_CPU_SOME_CRIT", 20),
    "psi_io_full_warn": env_float("CLUSTER_HEALTH_PSI_IO_FULL_WARN", 10),
    "psi_io_full_crit": env_float("CLUSTER_HEALTH_PSI_IO_FULL_CRIT", 20),
    "psi_mem_full_warn": env_float("CLUSTER_HEALTH_PSI_MEM_FULL_WARN", 5),
    "psi_mem_full_crit": env_float("CLUSTER_HEALTH_PSI_MEM_FULL_CRIT", 10),
    "lxc_mem_warn_pct": env_float("CLUSTER_HEALTH_LXC_MEM_WARN_PCT", 85),
    "lxc_mem_crit_pct": env_float("CLUSTER_HEALTH_LXC_MEM_CRIT_PCT", 95),
    "lxc_cpu_warn_pct": env_float("CLUSTER_HEALTH_LXC_CPU_WARN_PCT", 20),
    "lxc_cpu_crit_pct": env_float("CLUSTER_HEALTH_LXC_CPU_CRIT_PCT", 40),
    "node_skew_warn_pct": env_float("CLUSTER_HEALTH_NODE_SKEW_WARN_PCT", 45),
    "node_skew_crit_pct": env_float("CLUSTER_HEALTH_NODE_SKEW_CRIT_PCT", 55),
    "summary_top_n": env_int("CLUSTER_HEALTH_SUMMARY_TOP_N", 8),
}

# Raw pvesh dumps written by the collection stage.
with open(vm_json, "r", encoding="utf-8") as fh:
    vm_rows = json.load(fh)
with open(nodes_json, "r", encoding="utf-8") as fh:
    node_rows = json.load(fh)
def parse_uptime_load(text):
    """Extract the [1m, 5m, 15m] load averages from `uptime` output.

    Returns a list of three floats, or None when no load-average triple
    is present in `text`.
    """
    m = re.search(r"load average[s]?:\s*([0-9.]+),\s*([0-9.]+),\s*([0-9.]+)", text)
    if not m:
        return None
    return [float(m.group(1)), float(m.group(2)), float(m.group(3))]
def parse_free(text):
    """Parse `free -b` output into {"total": bytes, "used": bytes}.

    Returns None when no parsable "Mem:" line is found.
    """
    for line in text.splitlines():
        if line.startswith("Mem:"):
            parts = line.split()
            if len(parts) >= 3:
                total = int(parts[1])
                used = int(parts[2])
                return {"total": total, "used": used}
    return None
def parse_psi(section):
    """Parse /proc/pressure/* content into {kind: {metric: float}}.

    e.g. "some avg10=1.50 avg60=..." -> {"some": {"avg10": 1.5, ...}}.
    Empty input yields {}. Malformed lines (no key=value payload) and
    non-numeric values are skipped rather than raising — previously a
    single-word line or a token without "=" crashed with ValueError.
    """
    out = {}
    for line in section.splitlines():
        line = line.strip()
        if not line:
            continue
        parts = line.split(None, 1)
        if len(parts) != 2:
            continue
        kind, rest = parts
        vals = {}
        for token in rest.split():
            if "=" not in token:
                continue
            key, value = token.split("=", 1)
            try:
                vals[key] = float(value)
            except ValueError:
                pass
        out[kind] = vals
    return out
def parse_df_vz(section):
    """Parse `df -B1 -P <path>` output (header line + one data row).

    Returns a dict of byte sizes plus use_pct and mountpoint, or None
    when the section is missing or malformed.
    """
    lines = [line for line in section.splitlines() if line.strip()]
    if len(lines) < 2:
        return None
    parts = lines[-1].split()
    if len(parts) < 6:
        return None
    return {
        "filesystem": parts[0],
        "size_bytes": int(parts[1]),
        "used_bytes": int(parts[2]),
        "avail_bytes": int(parts[3]),
        "use_pct": float(parts[4].rstrip("%")),
        "mountpoint": parts[5],
    }
def parse_pvesm(section):
    """Parse `pvesm status` table rows into storage dicts.

    Columns: name type status total used available. A literal "0" in a
    size column is treated as "unknown" (None); use_pct is computed only
    when both total and used are known.
    NOTE(review): assumes the size columns are always numeric — confirm
    against pvesm output on inactive storages.
    """
    lines = [line.rstrip() for line in section.splitlines() if line.strip()]
    if len(lines) < 2:
        return []
    storage = []
    for line in lines[1:]:
        parts = line.split()
        if len(parts) < 6:
            continue
        item = {
            "name": parts[0],
            "type": parts[1],
            "status": parts[2],
            "total_bytes": None if parts[3] == "0" else int(parts[3]),
            "used_bytes": None if parts[4] == "0" else int(parts[4]),
            "available_bytes": None if parts[5] == "0" else int(parts[5]),
            "use_pct": None,
        }
        if item["total_bytes"] and item["used_bytes"] is not None:
            item["use_pct"] = round((item["used_bytes"] / item["total_bytes"]) * 100, 2)
        storage.append(item)
    return storage
def split_sections(section_text):
    """Split collector output on __MARKER__ delimiter lines.

    Returns {marker_name: body_text} with bodies stripped of surrounding
    whitespace; text before the first marker is discarded.
    """
    sections = {}
    current = None
    bucket = []
    for raw in section_text.splitlines():
        line = raw.rstrip("\n")
        if line.startswith("__") and line.endswith("__"):
            if current is not None:
                sections[current] = "\n".join(bucket).strip()
            # strip("_") turns "__PSI_CPU__" into "PSI_CPU".
            current = line.strip("_")
            bucket = []
            continue
        bucket.append(line)
    if current is not None:
        sections[current] = "\n".join(bucket).strip()
    return sections
# Severity ordering used for alert sorting and overall-status computation.
severity_rank = {"ok": 0, "warn": 1, "crit": 2}
alerts = []


def add_alert(severity, scope, entity, metric, message, value=None, threshold=None):
    """Append a structured finding to the module-level alerts list.

    severity: "warn" | "crit"; scope: "node" | "lxc" | "cluster";
    value/threshold are the observed metric and the limit it crossed
    (None when not applicable, e.g. collection failures).
    """
    alerts.append({
        "severity": severity,
        "scope": scope,
        "entity": entity,
        "metric": metric,
        "message": message,
        "value": value,
        "threshold": threshold,
    })
# Parse each node's raw collector output and evaluate node-level
# thresholds. A COLLECTION_FAILED sentinel (written by the shell stage)
# becomes a CRIT alert instead of crashing the report.
node_metrics = {}
for path in sorted(os.listdir(node_dir)):
    if not path.endswith(".txt"):
        continue
    node_name = path[:-4]
    full_path = os.path.join(node_dir, path)
    text = open(full_path, "r", encoding="utf-8", errors="replace").read()
    if text.startswith("COLLECTION_FAILED"):
        node_metrics[node_name] = {
            "node": node_name,
            "collection_failed": True,
            # Lines after the sentinel carry the ssh stderr, if any.
            "error": text.splitlines()[1:] if len(text.splitlines()) > 1 else [],
        }
        add_alert("crit", "node", node_name, "collection", f"{node_name} metrics collection failed")
        continue

    sections = split_sections(text)
    loads = parse_uptime_load(sections.get("UPTIME", ""))
    nproc = None
    try:
        nproc = int((sections.get("NPROC", "0").splitlines() or ["0"])[0].strip())
    except ValueError:
        nproc = 0
    free_mem = parse_free(sections.get("FREE", ""))
    psi = {
        "cpu": parse_psi(sections.get("PSI_CPU", "")),
        "io": parse_psi(sections.get("PSI_IO", "")),
        "memory": parse_psi(sections.get("PSI_MEMORY", "")),
    }
    df_vz = parse_df_vz(sections.get("DF_VZ", ""))
    pvesm = parse_pvesm(sections.get("PVESM", ""))

    metric = {
        "node": node_name,
        "hostname": (sections.get("HOSTNAME", node_name).splitlines() or [node_name])[0].strip(),
        "collection_failed": False,
        "loadavg": loads,  # may be None when uptime output was unparsable
        "nproc": nproc,
        "load_per_cpu_1m": round(loads[0] / nproc, 3) if loads and nproc else None,
        "memory": None,  # filled below when `free` parsed successfully
        "psi": psi,
        "df_vz": df_vz,  # may be None when df probe failed on the node
        "storage": pvesm,
    }
    if free_mem and free_mem["total"] > 0:
        metric["memory"] = {
            **free_mem,
            "used_pct": round((free_mem["used"] / free_mem["total"]) * 100, 2),
        }
    node_metrics[node_name] = metric

    # Threshold checks: load per core, host RAM, /var/lib/vz, PSI avg10.
    lp = metric["load_per_cpu_1m"]
    if lp is not None:
        if lp >= T["node_load_crit_per_cpu"]:
            add_alert("crit", "node", node_name, "load_per_cpu_1m", f"{node_name} load/core is high", lp, T["node_load_crit_per_cpu"])
        elif lp >= T["node_load_warn_per_cpu"]:
            add_alert("warn", "node", node_name, "load_per_cpu_1m", f"{node_name} load/core is elevated", lp, T["node_load_warn_per_cpu"])

    mem = metric["memory"]
    if mem:
        if mem["used_pct"] >= T["node_mem_crit_pct"]:
            add_alert("crit", "node", node_name, "memory_used_pct", f"{node_name} host RAM usage is high", mem["used_pct"], T["node_mem_crit_pct"])
        elif mem["used_pct"] >= T["node_mem_warn_pct"]:
            add_alert("warn", "node", node_name, "memory_used_pct", f"{node_name} host RAM usage is elevated", mem["used_pct"], T["node_mem_warn_pct"])

    if df_vz:
        if df_vz["use_pct"] >= T["vz_crit_pct"]:
            add_alert("crit", "node", node_name, "vz_use_pct", f"{node_name} /var/lib/vz is near full", df_vz["use_pct"], T["vz_crit_pct"])
        elif df_vz["use_pct"] >= T["vz_warn_pct"]:
            add_alert("warn", "node", node_name, "vz_use_pct", f"{node_name} /var/lib/vz usage is elevated", df_vz["use_pct"], T["vz_warn_pct"])

    cpu_some = psi["cpu"].get("some", {}).get("avg10")
    if cpu_some is not None:
        if cpu_some >= T["psi_cpu_some_crit"]:
            add_alert("crit", "node", node_name, "psi_cpu_some_avg10", f"{node_name} CPU pressure is high", cpu_some, T["psi_cpu_some_crit"])
        elif cpu_some >= T["psi_cpu_some_warn"]:
            add_alert("warn", "node", node_name, "psi_cpu_some_avg10", f"{node_name} CPU pressure is elevated", cpu_some, T["psi_cpu_some_warn"])

    io_full = psi["io"].get("full", {}).get("avg10")
    if io_full is not None:
        if io_full >= T["psi_io_full_crit"]:
            add_alert("crit", "node", node_name, "psi_io_full_avg10", f"{node_name} I/O full pressure is high", io_full, T["psi_io_full_crit"])
        elif io_full >= T["psi_io_full_warn"]:
            add_alert("warn", "node", node_name, "psi_io_full_avg10", f"{node_name} I/O full pressure is elevated", io_full, T["psi_io_full_warn"])

    mem_full = psi["memory"].get("full", {}).get("avg10")
    if mem_full is not None:
        if mem_full >= T["psi_mem_full_crit"]:
            add_alert("crit", "node", node_name, "psi_memory_full_avg10", f"{node_name} memory full pressure is high", mem_full, T["psi_mem_full_crit"])
        elif mem_full >= T["psi_mem_full_warn"]:
            add_alert("warn", "node", node_name, "psi_memory_full_avg10", f"{node_name} memory full pressure is elevated", mem_full, T["psi_mem_full_warn"])
# Classify LXC rows from /cluster/resources and evaluate per-container
# CPU/memory thresholds plus cluster-level placement skew.
lxc_rows = [row for row in vm_rows if row.get("type") == "lxc"]
running_lxcs = [row for row in lxc_rows if row.get("status") == "running"]
stopped_lxcs = [row for row in lxc_rows if row.get("status") != "running"]

# Placement skew: alert when one node hosts too large a share of running LXCs.
node_counts = Counter(row.get("node", "unknown") for row in running_lxcs)
running_total = len(running_lxcs)
for node, count in node_counts.items():
    pct = round((count / running_total) * 100, 2) if running_total else 0.0
    if pct >= T["node_skew_crit_pct"]:
        add_alert("crit", "cluster", node, "running_lxc_share_pct", f"{node} holds a large share of running LXCs", pct, T["node_skew_crit_pct"])
    elif pct >= T["node_skew_warn_pct"]:
        add_alert("warn", "cluster", node, "running_lxc_share_pct", f"{node} holds a high share of running LXCs", pct, T["node_skew_warn_pct"])

mem_hot = []
cpu_hot = []
disk_rw = []
network_totals = []
all_lxcs = []

for row in running_lxcs:
    maxmem = row.get("maxmem") or 0
    mem = row.get("mem") or 0
    mem_pct = round((mem / maxmem) * 100, 2) if maxmem else None
    maxcpu = row.get("maxcpu") or 0
    # NOTE(review): pvesh `cpu` appears to be a 0..1-per-core fraction —
    # confirm the scale against the thresholds before tuning them.
    cpu_pct = round(float(row.get("cpu") or 0) * 100, 2)
    diskread = int(row.get("diskread") or 0)
    diskwrite = int(row.get("diskwrite") or 0)
    netin = int(row.get("netin") or 0)
    netout = int(row.get("netout") or 0)
    entry = {
        "vmid": row.get("vmid"),
        "name": row.get("name"),
        "node": row.get("node"),
        "cpu_pct": cpu_pct,
        "maxcpu": maxcpu,
        "mem_pct": mem_pct,
        "mem_bytes": mem,
        "maxmem_bytes": maxmem,
        "disk_pct": round(((row.get("disk") or 0) / row.get("maxdisk")) * 100, 2) if row.get("maxdisk") else None,
        "disk_bytes": int(row.get("disk") or 0),
        "maxdisk_bytes": int(row.get("maxdisk") or 0),
        "diskread_bytes": diskread,
        "diskwrite_bytes": diskwrite,
        "netin_bytes": netin,
        "netout_bytes": netout,
        "status": row.get("status"),
    }
    all_lxcs.append(entry)
    if mem_pct is not None:
        mem_hot.append(entry)
        if mem_pct >= T["lxc_mem_crit_pct"]:
            add_alert("crit", "lxc", f"{row.get('vmid')}:{row.get('name')}", "memory_used_pct", "LXC memory usage is high", mem_pct, T["lxc_mem_crit_pct"])
        elif mem_pct >= T["lxc_mem_warn_pct"]:
            add_alert("warn", "lxc", f"{row.get('vmid')}:{row.get('name')}", "memory_used_pct", "LXC memory usage is elevated", mem_pct, T["lxc_mem_warn_pct"])
    if cpu_pct >= T["lxc_cpu_crit_pct"]:
        add_alert("crit", "lxc", f"{row.get('vmid')}:{row.get('name')}", "cpu_pct", "LXC CPU usage is high", cpu_pct, T["lxc_cpu_crit_pct"])
    elif cpu_pct >= T["lxc_cpu_warn_pct"]:
        add_alert("warn", "lxc", f"{row.get('vmid')}:{row.get('name')}", "cpu_pct", "LXC CPU usage is elevated", cpu_pct, T["lxc_cpu_warn_pct"])
    cpu_hot.append(entry)
    disk_rw.append({**entry, "disk_total_bytes": diskread + diskwrite})
    network_totals.append({**entry, "network_total_bytes": netin + netout})

# Rank the hot lists (descending); vmid is the deterministic tie-breaker.
mem_hot.sort(key=lambda x: (-1 if x["mem_pct"] is None else -x["mem_pct"], x["vmid"]))
cpu_hot.sort(key=lambda x: (-x["cpu_pct"], x["vmid"]))
disk_rw.sort(key=lambda x: (-x["disk_total_bytes"], x["vmid"]))
network_totals.sort(key=lambda x: (-x["network_total_bytes"], x["vmid"]))

# Most severe findings first, then stable ordering for readability.
alerts.sort(key=lambda a: (-severity_rank[a["severity"]], a["scope"], str(a["entity"]), a["metric"]))
# Overall status is the worst severity present in the alert list.
overall = "ok"
if any(a["severity"] == "crit" for a in alerts):
    overall = "crit"
elif any(a["severity"] == "warn" for a in alerts):
    overall = "warn"

# Per-node summary merging cluster-API rollups with collected host metrics.
node_summary = []
for row in node_rows:
    node_name = row.get("node")
    metric = node_metrics.get(node_name, {"collection_failed": True})
    failed = bool(metric.get("collection_failed"))
    # BUGFIX: "loadavg"/"memory"/"df_vz" can be present-but-None even on a
    # successful collection (unparsable uptime/free, failed df probe), so
    # metric.get(key, {}) returned None and .get()/[0] raised
    # AttributeError/TypeError. `(x or default)` handles both missing and
    # None-valued keys.
    node_summary.append({
        "node": node_name,
        "status": row.get("status"),
        "running_lxcs": node_counts.get(node_name, 0),
        "cluster_cpu_fraction_pct": round(float(row.get("cpu") or 0) * 100, 2) if row.get("cpu") is not None else None,
        "cluster_mem_fraction_pct": round(((row.get("mem") or 0) / row.get("maxmem")) * 100, 2) if row.get("maxmem") else None,
        "loadavg_1m": (metric.get("loadavg") or [None])[0] if not failed else None,
        "load_per_cpu_1m": metric.get("load_per_cpu_1m"),
        "host_mem_used_pct": (metric.get("memory") or {}).get("used_pct") if not failed else None,
        "psi_cpu_some_avg10": (metric.get("psi") or {}).get("cpu", {}).get("some", {}).get("avg10") if not failed else None,
        "psi_io_full_avg10": (metric.get("psi") or {}).get("io", {}).get("full", {}).get("avg10") if not failed else None,
        "psi_memory_full_avg10": (metric.get("psi") or {}).get("memory", {}).get("full", {}).get("avg10") if not failed else None,
        "vz_use_pct": (metric.get("df_vz") or {}).get("use_pct") if not failed else None,
    })
# Final report document: `nodes` is the flattened summary, `node_metrics`
# keeps the full per-node detail, `top_lxcs` the ranked hot lists.
report = {
    "collected_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
    "seed_host": seed_host,
    "overall_status": overall,
    "thresholds": T,
    "cluster": {
        "total_lxcs": len(lxc_rows),
        "running_lxcs": running_total,
        "stopped_lxcs": len(stopped_lxcs),
        "running_distribution": [
            {
                "node": node,
                "running_lxcs": count,
                "share_pct": round((count / running_total) * 100, 2) if running_total else 0.0,
            }
            for node, count in sorted(node_counts.items())
        ],
    },
    "nodes": node_summary,
    "node_metrics": node_metrics,
    "top_lxcs": {
        "memory_pct": mem_hot[:T["summary_top_n"]],
        "cpu_pct": cpu_hot[:T["summary_top_n"]],
        "disk_total_bytes": disk_rw[:T["summary_top_n"]],
        "network_total_bytes": network_totals[:T["summary_top_n"]],
    },
    # NOTE(review): assumes every LXC row carries an int-coercible vmid —
    # confirm against pvesh output.
    "lxcs": sorted(all_lxcs, key=lambda x: (str(x["node"]), int(x["vmid"]))),
    "alerts": alerts,
}

with open(json_out, "w", encoding="utf-8") as fh:
    json.dump(report, fh, indent=2)
    fh.write("\n")
def gib(n):
    """Convert a byte count to GiB, rounded to 2 decimal places."""
    return round(n / (1024 ** 3), 2)
# Build the human-readable summary mirroring the JSON highlights.
lines = []
lines.append(f"LXC cluster health {report['collected_at']} ({overall.upper()})")
lines.append(f"Seed host: {report['seed_host']}")
lines.append(f"LXCs: running {running_total} / total {len(lxc_rows)} / stopped {len(stopped_lxcs)}")
lines.append("")
lines.append("Node summary:")
for item in node_summary:
    lines.append(
        f"- {item['node']}: running_lxcs={item['running_lxcs']}, "
        f"load1={item['loadavg_1m'] if item['loadavg_1m'] is not None else 'n/a'}, "
        f"load/core={item['load_per_cpu_1m'] if item['load_per_cpu_1m'] is not None else 'n/a'}, "
        f"host_mem={item['host_mem_used_pct'] if item['host_mem_used_pct'] is not None else 'n/a'}%, "
        f"psi_io_full_avg10={item['psi_io_full_avg10'] if item['psi_io_full_avg10'] is not None else 'n/a'}, "
        f"vz={item['vz_use_pct'] if item['vz_use_pct'] is not None else 'n/a'}%"
    )
lines.append("")
lines.append("Top findings:")
if alerts:
    # Show at least 10 findings even when summary_top_n is smaller.
    for alert in alerts[: max(T["summary_top_n"], 10)]:
        value = "" if alert["value"] is None else f" value={alert['value']}"
        threshold = "" if alert["threshold"] is None else f" threshold={alert['threshold']}"
        lines.append(f"- [{alert['severity'].upper()}] {alert['scope']} {alert['entity']} {alert['metric']}: {alert['message']}{value}{threshold}")
else:
    lines.append("- none")

lines.append("")
lines.append("Top LXC memory:")
for item in mem_hot[: T["summary_top_n"]]:
    lines.append(f"- {item['vmid']} {item['name']} @ {item['node']}: mem={item['mem_pct']}% of {gib(item['maxmem_bytes'])} GiB, cpu={item['cpu_pct']}%")

lines.append("")
lines.append("Top LXC CPU:")
for item in cpu_hot[: T["summary_top_n"]]:
    lines.append(f"- {item['vmid']} {item['name']} @ {item['node']}: cpu={item['cpu_pct']}%, mem={item['mem_pct']}%")

lines.append("")
lines.append("Top cumulative disk:")
for item in disk_rw[: T["summary_top_n"]]:
    lines.append(f"- {item['vmid']} {item['name']} @ {item['node']}: read+write={gib(item['disk_total_bytes'])} GiB")

lines.append("")
lines.append("Top cumulative network:")
for item in network_totals[: T["summary_top_n"]]:
    lines.append(f"- {item['vmid']} {item['name']} @ {item['node']}: in+out={gib(item['network_total_bytes'])} GiB")

with open(text_out, "w", encoding="utf-8") as fh:
    fh.write("\n".join(lines).rstrip() + "\n")
# Machine-readable pointer (paths + status) on stdout for callers.
print(json.dumps({
    "overall_status": overall,
    "json_report": json_out,
    "text_report": text_out,
    "alerts": len(alerts),
}))

# Exit code mirrors overall status: 0 ok, 1 warn, 2 crit (the shell
# wrapper propagates it; 3 is reserved for collection failures).
if overall == "crit":
    sys.exit(2)
if overall == "warn":
    sys.exit(1)
sys.exit(0)
PY
# Capture the Python stage's status (0/1/2) before re-enabling errexit.
RC=$?
set -e

# Show the generated report on stdout; --json emits the raw JSON, the
# default mode prints the text summary plus the report paths.
if [[ "${JSON_ONLY}" -eq 1 ]]; then
  cat "${JSON_OUT}"
else
  cat "${TEXT_OUT}"
  echo
  echo "JSON: ${JSON_OUT}"
  echo "Text: ${TEXT_OUT}"
fi

exit "${RC}"