ops: oracle publisher LXC 3500/3501, CT migrate docs, Besu/RPC maintenance
- Provision oracle-publisher on CT 3500 (quoted DATA_SOURCE URLs, dotenv). - Host-side pct-lxc-3501-net-up for ccip-monitor eth0 after migrate. - CoinGecko key script: avoid sed & corruption; document quoted URLs. - Besu node list reload, fstrim/RPC scripts, storage health docs. - Submodule smom-dbis-138: web3 v6 pin, oracle check default host r630-02. Made-with: Cursor
This commit is contained in:
@@ -3,6 +3,10 @@
|
||||
# Usage: ./scripts/maintenance/fstrim-all-running-ct.sh [--dry-run]
|
||||
# Requires: SSH key-based access to ml110, r630-01, r630-02.
|
||||
# See: docs/04-configuration/STORAGE_GROWTH_AND_HEALTH.md
|
||||
#
|
||||
# Environment (optional):
|
||||
# FSTRIM_TIMEOUT_SEC Seconds per CT (default 180). Use 45–60 for faster fleet passes when many CTs hang on FITRIM.
|
||||
# FSTRIM_HOSTS Space-separated host keys: ml110 r630-01 r630-02 (default: all three).
|
||||
|
||||
set -euo pipefail

# Proxmox host IPs; overridable via config/ip-addresses.conf or env.
ML110="${PROXMOX_HOST_ML110:-192.168.11.10}"
R630_01="${PROXMOX_HOST_R630_01:-192.168.11.11}"
R630_02="${PROXMOX_HOST_R630_02:-192.168.11.12}"

# Per-CT fstrim timeout; 45-60s recommended for fast fleet passes.
FSTRIM_TIMEOUT_SEC="${FSTRIM_TIMEOUT_SEC:-180}"
# Split the space-separated host list into an array without globbing
# (replaces the unquoted expansion that needed a shellcheck SC2206 disable).
read -r -a FSTRIM_HOSTS_ARR <<< "${FSTRIM_HOSTS:-ml110 r630-01 r630-02}"

DRY_RUN=0
[[ "${1:-}" == "--dry-run" ]] && DRY_RUN=1

# Best-effort SSH wrapper: never aborts the fleet pass on one bad host.
# Keep-alive + accept-new so first contact and slow hosts do not hang the run.
run_ssh() { ssh -o ConnectTimeout=15 -o ServerAliveInterval=10 -o StrictHostKeyChecking=accept-new root@"$1" "$2" 2>/dev/null || true; }
|
||||
|
||||
fstrim_host() {
|
||||
local host_ip="$1" host_name="$2"
|
||||
@@ -29,21 +37,30 @@ fstrim_host() {
|
||||
fi
|
||||
for vmid in $vmids; do
|
||||
if [[ $DRY_RUN -eq 1 ]]; then
|
||||
echo " [dry-run] $host_name VMID $vmid: would run fstrim -v /"
|
||||
echo " [dry-run] $host_name VMID $vmid: would run fstrim -v / (timeout ${FSTRIM_TIMEOUT_SEC}s)"
|
||||
else
|
||||
out=$(run_ssh "$host_ip" "pct exec $vmid -- fstrim -v / 2>&1" || true)
|
||||
# timeout: some CTs hang on FITRIM or slow storage; do not block entire fleet
|
||||
out=$(run_ssh "$host_ip" "timeout \"${FSTRIM_TIMEOUT_SEC}\" pct exec $vmid -- fstrim -v / 2>&1" || true)
|
||||
echo " $host_name VMID $vmid: ${out:-done}"
|
||||
fi
|
||||
done
|
||||
}
|
||||
|
||||
echo "=== fstrim all running CTs (reclaim thin pool space) ==="
echo " timeout_per_ct=${FSTRIM_TIMEOUT_SEC}s hosts=${FSTRIM_HOSTS_ARR[*]}"
[[ $DRY_RUN -eq 1 ]] && echo "(dry-run: no changes)"
echo ""

# Dispatch only the hosts selected in FSTRIM_HOSTS (default: all three).
# The old unconditional fstrim_host calls were removed: keeping them alongside
# this loop would trim every host twice per run.
for key in "${FSTRIM_HOSTS_ARR[@]}"; do
  case "$key" in
    ml110)   fstrim_host "$ML110" "ml110" ;;
    r630-01) fstrim_host "$R630_01" "r630-01" ;;
    r630-02) fstrim_host "$R630_02" "r630-02" ;;
    *)
      # Diagnostics go to stderr so stdout stays a clean trim report.
      echo " Unknown FSTRIM_HOSTS entry: $key (use ml110, r630-01, r630-02)" >&2
      ;;
  esac
done

echo ""
echo "Done. Schedule weekly via cron or run with daily-weekly-checks weekly."
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
#!/usr/bin/env bash
|
||||
# Make RPC VMIDs (2101, 2500-2505) writable by running e2fsck on their rootfs (fixes read-only remount after ext4 errors).
|
||||
# Make Besu CT rootfs writable by running e2fsck on their root LV (fixes read-only / emergency_ro after ext4 errors).
|
||||
# SSHs to the Proxmox host (r630-01), stops each CT, runs e2fsck -f -y on the LV, starts the CT.
|
||||
#
|
||||
# Usage: ./scripts/maintenance/make-rpc-vmids-writable-via-ssh.sh [--dry-run]
|
||||
# Optional: BESU_WRITABLE_VMIDS="1500 1501 1502" to add sentries or other CTs (default: Core RPC 2101 only).
|
||||
# Run from project root. Requires: SSH to r630-01 (root, key-based).
|
||||
# See: docs/00-meta/502_DEEP_DIVE_ROOT_CAUSES_AND_FIXES.md §Read-only CT
|
||||
|
||||
@@ -13,9 +14,14 @@ PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
|
||||
[[ -f "${PROJECT_ROOT}/config/ip-addresses.conf" ]] && source "${PROJECT_ROOT}/config/ip-addresses.conf" 2>/dev/null || true
|
||||
|
||||
HOST="${PROXMOX_HOST_R630_01:-192.168.11.11}"

# Default: Core RPC on r630-01 (2101). 2500-2505 removed — destroyed; see ALL_VMIDS_ENDPOINTS.md.
# Add sentries with: BESU_WRITABLE_VMIDS="1500 1501 1502 2101" ./scripts/maintenance/make-rpc-vmids-writable-via-ssh.sh
# (The stale RPC_VMIDS=(2101 2500 ... 2505) assignment was dropped: it was
# immediately overwritten below and listed CTs that no longer exist.)
if [[ -n "${BESU_WRITABLE_VMIDS:-}" ]]; then
  read -r -a RPC_VMIDS <<< "${BESU_WRITABLE_VMIDS}"
else
  RPC_VMIDS=(2101)
fi

# Single SSH_OPTS definition (keep-alive variant); the older ConnectTimeout=15
# string it shadowed was removed.
SSH_OPTS="-o ConnectTimeout=20 -o ServerAliveInterval=15 -o StrictHostKeyChecking=accept-new"

DRY_RUN=false
[[ "${1:-}" == "--dry-run" ]] && DRY_RUN=true
|
||||
|
||||
@@ -2,6 +2,14 @@
|
||||
# Migrate one LXC container from r630-01 to r630-02 (backup → copy → restore).
|
||||
# Use to free space on r630-01's thin pool. Run from project root (LAN); needs SSH to both hosts.
|
||||
#
|
||||
# IMPORTANT — unprivileged CTs: vzdump often fails with tar "Permission denied" inside the guest.
|
||||
# Prefer cluster migration via API (maps source storage to target), e.g.:
|
||||
# ssh root@192.168.11.11 "pvesh create /nodes/r630-01/lxc/<VMID>/migrate --target r630-02 --target-storage thin5 --restart 1"
|
||||
# See docs/03-deployment/MIGRATE_CT_R630_01_TO_R630_02.md
|
||||
#
|
||||
# NEVER run `pct set <vmid> --delete unused0` if unused0 and rootfs reference the same disk name
|
||||
# on different storages (e.g. local-lvm:vm-N-disk-0 vs thin1:vm-N-disk-0) — Proxmox may remove the only root LV.
|
||||
#
|
||||
# Usage:
|
||||
# ./scripts/maintenance/migrate-ct-r630-01-to-r630-02.sh <VMID> [target_storage]
|
||||
# ./scripts/maintenance/migrate-ct-r630-01-to-r630-02.sh 5200 thin1
|
||||
|
||||
22
scripts/maintenance/pct-lxc-3501-net-up.sh
Executable file
22
scripts/maintenance/pct-lxc-3501-net-up.sh
Executable file
@@ -0,0 +1,22 @@
|
||||
#!/usr/bin/env bash
# Bring up static networking inside unprivileged LXC 3501 (ccip-monitor) when eth0 stays DOWN.
# Run on the Proxmox node that hosts VMID 3501 (r630-02). Optional: @reboot cron.
#
# Usage (on r630-02 as root): /usr/local/sbin/pct-lxc-3501-net-up.sh
# Install: scp to r630-02 /usr/local/sbin/ && chmod +x

set -euo pipefail

# All addressing is overridable via env so the script can serve other CTs.
VMID="${CCIP_MONITOR_VMID:-3501}"
IP="${CCIP_MONITOR_IP:-192.168.11.28/24}"
GW="${CCIP_MONITOR_GW:-192.168.11.1}"
BCAST="${CCIP_MONITOR_BCAST:-192.168.11.255}"

# No-op when the CT is not running (e.g. during host boot ordering with @reboot).
if ! pct status "$VMID" 2>/dev/null | grep -q running; then
  exit 0
fi

pct exec "$VMID" -- ip link set eth0 up
# 'replace' is idempotent; fall back to 'add' if the guest's iproute2 lacks it.
pct exec "$VMID" -- ip addr replace "$IP" dev eth0 broadcast "$BCAST" 2>/dev/null || \
  pct exec "$VMID" -- ip addr add "$IP" dev eth0 broadcast "$BCAST"
pct exec "$VMID" -- ip route replace default via "$GW" dev eth0 2>/dev/null || \
  pct exec "$VMID" -- ip route add default via "$GW" dev eth0
|
||||
115
scripts/maintenance/proxmox-host-io-optimize-pass.sh
Executable file
115
scripts/maintenance/proxmox-host-io-optimize-pass.sh
Executable file
@@ -0,0 +1,115 @@
|
||||
#!/usr/bin/env bash
# Additional pass: diagnose I/O + load on Proxmox nodes, then apply safe host-level optimizations.
# - Reports: load, PSI, zpool, pvesm, scrub, vzdump, running CT count
# - Applies (idempotent): vm.swappiness on ml110; sysstat; host fstrim where supported
#
# Usage: ./scripts/maintenance/proxmox-host-io-optimize-pass.sh [--diagnose-only]
# Requires: SSH key root@ ml110, r630-01, r630-02 (see config/ip-addresses.conf)

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
# shellcheck source=/dev/null
source "${PROJECT_ROOT}/config/ip-addresses.conf" 2>/dev/null || true

# Host IPs: prefer PROXMOX_* vars, fall back to legacy PROXMOX_HOST_* names,
# then hardcoded LAN defaults.
ML="${PROXMOX_ML110:-${PROXMOX_HOST_ML110:-192.168.11.10}}"
R1="${PROXMOX_R630_01:-${PROXMOX_HOST_R630_01:-192.168.11.11}}"
R2="${PROXMOX_R630_02:-${PROXMOX_HOST_R630_02:-192.168.11.12}}"

SSH_OPTS=(-o ConnectTimeout=20 -o ServerAliveInterval=15 -o StrictHostKeyChecking=accept-new)
DIAG_ONLY=false
[[ "${1:-}" == "--diagnose-only" ]] && DIAG_ONLY=true

# Run a bash script, supplied on stdin (heredoc), on a remote host as root.
remote() { ssh "${SSH_OPTS[@]}" "root@$1" bash -s; }

echo "=== Proxmox host I/O optimize pass ($(date -Is)) ==="
echo " ml110=$ML r630-01=$R1 r630-02=$R2 diagnose-only=$DIAG_ONLY"
echo ""

# --- Diagnostics on every host (read-only, set +e so partial data still prints) ---
for H in "$ML" "$R1" "$R2"; do
  echo "########## DIAGNOSTIC: $H ##########"
  remote "$H" <<'EOS'
set +e
hostname
uptime
echo "--- PSI ---"
head -2 /proc/pressure/cpu 2>/dev/null
head -2 /proc/pressure/io 2>/dev/null
echo "--- pvesm ---"
pvesm status 2>/dev/null | head -25
echo "--- running workloads ---"
echo -n "LXC running: "; pct list 2>/dev/null | awk 'NR>1 && $2=="running"' | wc -l
echo -n "VM running: "; qm list 2>/dev/null | awk 'NR>1 && $3=="running"' | wc -l
echo "--- vzdump ---"
ps aux 2>/dev/null | grep -E '[v]zdump|[p]bs-|proxmox-backup' | head -5 || echo "(none visible)"
echo "--- ZFS ---"
zpool status 2>/dev/null | head -20 || echo "no zfs"
echo "--- scrub ---"
zpool status 2>/dev/null | grep -E 'scan|scrub' || true
EOS
  echo ""
done

if $DIAG_ONLY; then
  echo "Diagnose-only: done."
  exit 0
fi

# --- Optimization 1: vm.swappiness=10 on ml110 (idempotent sysctl drop-in) ---
echo "########## OPTIMIZE: ml110 swappiness ##########"
remote "$ML" <<'EOS'
set -e
F=/etc/sysctl.d/99-proxmox-ml110-swappiness.conf
if ! grep -q '^vm.swappiness=10$' "$F" 2>/dev/null; then
  printf '%s\n' '# Prefer RAM over swap when plenty of memory free (operator pass)' 'vm.swappiness=10' > "$F"
  sysctl -p "$F"
  echo "Wrote and applied $F"
else
  echo "Already vm.swappiness=10 in $F"
  sysctl vm.swappiness=10 2>/dev/null || true
fi
EOS
echo ""

# --- Optimization 2: install/enable sysstat (sar history) on all hosts ---
echo "########## OPTIMIZE: sysstat (all hosts) ##########"
for H in "$ML" "$R1" "$R2"; do
  echo "--- $H ---"
  remote "$H" <<'EOS'
set -e
export DEBIAN_FRONTEND=noninteractive
if command -v sar >/dev/null 2>&1; then
  echo "sysstat already present"
else
  apt-get update -qq && apt-get install -y -qq sysstat
fi
sed -i 's/^ENABLED="false"/ENABLED="true"/' /etc/default/sysstat 2>/dev/null || true
systemctl enable sysstat 2>/dev/null || true
systemctl restart sysstat 2>/dev/null || true
echo "sar: $(command -v sar || echo missing)"
EOS
done
echo ""

# --- Optimization 3: fstrim the hypervisor's own mounts where supported ---
echo "########## OPTIMIZE: host fstrim (hypervisor root / and /var/lib/vz if supported) ##########"
for H in "$ML" "$R1" "$R2"; do
  echo "--- $H ---"
  remote "$H" <<'EOS'
set +e
for m in / /var/lib/vz; do
  if mountpoint -q "$m" 2>/dev/null; then
    out=$(fstrim -v "$m" 2>&1)
    echo "$m: $out"
  fi
done
EOS
done
echo ""

# --- Post-pass: one-line load snapshot per host ---
echo "########## POST: quick load snapshot ##########"
for H in "$ML" "$R1" "$R2"; do
  echo -n "$H "
  ssh "${SSH_OPTS[@]}" "root@$H" "cut -d' ' -f1-3 /proc/loadavg" 2>/dev/null || echo "unreachable"
done

echo ""
echo "Done. Optional: run ./scripts/maintenance/fstrim-all-running-ct.sh during a quiet window (can be I/O heavy)."
|
||||
55
scripts/maintenance/restart-ml110-besu-rpc-staggered.sh
Executable file
55
scripts/maintenance/restart-ml110-besu-rpc-staggered.sh
Executable file
@@ -0,0 +1,55 @@
|
||||
#!/usr/bin/env bash
# Staggered restart of Besu RPC services on ML110 (192.168.11.10) only.
# Use after fleet restarts or when multiple RPC CTs compete for disk — avoids all nodes stuck in RocksDB open/compact.
#
# Usage: ./scripts/maintenance/restart-ml110-besu-rpc-staggered.sh [--dry-run]
# Env: ML110_WAIT_SEC between restarts (default 75), PROXMOX_HOST_ML110 (default 192.168.11.10)

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
# shellcheck source=/dev/null
source "${PROJECT_ROOT}/config/ip-addresses.conf" 2>/dev/null || true

HOST="${PROXMOX_ML110:-${PROXMOX_HOST_ML110:-192.168.11.10}}"
WAIT="${ML110_WAIT_SEC:-75}"
SSH_OPTS=(-o ConnectTimeout=25 -o ServerAliveInterval=15 -o StrictHostKeyChecking=accept-new)

# RPC-only CTs on ML110 (see ALL_VMIDS_ENDPOINTS.md)
RPC_VMIDS=(2102 2301 2304 2305 2306 2307 2308 2400 2402 2403)

DRY_RUN=false
[[ "${1:-}" == "--dry-run" ]] && DRY_RUN=true

echo "=== Staggered besu-rpc restart on $HOST ==="
echo " VMIDs: ${RPC_VMIDS[*]}"
echo " Wait between: ${WAIT}s dry-run=$DRY_RUN"
echo ""

# Fail fast if the host is unreachable before touching any CT.
if ! ssh "${SSH_OPTS[@]}" "root@$HOST" "echo OK" 2>/dev/null; then
  echo "Cannot SSH to root@$HOST" >&2
  exit 1
fi

# Restart each CT's besu-rpc with a pause between CTs so they do not all
# re-open/compact RocksDB at once; skip the pause after the final VMID.
last="${RPC_VMIDS[$(( ${#RPC_VMIDS[@]} - 1 ))]}"
for vmid in "${RPC_VMIDS[@]}"; do
  if $DRY_RUN; then
    echo "[dry-run] would restart VMID $vmid"
  else
    echo "$(date -Is) restarting VMID $vmid ..."
    # 180s cap per restart; a hung unit must not block the whole stagger.
    if ssh "${SSH_OPTS[@]}" "root@$HOST" "timeout 180 pct exec $vmid -- systemctl restart besu-rpc.service"; then
      echo " OK"
    else
      echo " FAIL (timeout or error)" >&2
    fi
  fi
  if [[ "$vmid" != "$last" ]] && ! $DRY_RUN; then
    echo " waiting ${WAIT}s ..."
    sleep "$WAIT"
  fi
done

echo ""
echo "Done. Wait 2–5 minutes for 2402/2403 if RocksDB compaction runs; then:"
echo " ./scripts/verify/check-chain138-rpc-health.sh"
|
||||
Reference in New Issue
Block a user