From 61841b8291f6c9bca08c5488d51118a04b1cda1b Mon Sep 17 00:00:00 2001 From: defiQUG Date: Thu, 9 Apr 2026 01:20:00 -0700 Subject: [PATCH] feat(it-ops): live inventory, drift API, Keycloak IT role, portal sync hint - Add scripts/it-ops (Proxmox collector, IPAM drift, export orchestrator) - Add sankofa-it-read-api stub with optional CORS and refresh - Add systemd examples for read API, weekly inventory export, timer - Add live-inventory-drift GitHub workflow (dispatch + weekly) - Add IT controller spec, runbooks, Keycloak ensure-it-admin-role script - Note IT_READ_API env on portal sync completion output Made-with: Cursor --- .github/workflows/live-inventory-drift.yml | 29 ++ ...ankofa-it-inventory-export.service.example | 18 + .../sankofa-it-inventory-export.timer.example | 10 + .../sankofa-it-read-api.service.example | 24 ++ .../SANKOFA_IT_OPERATIONS_CONTROLLER_SPEC.md | 177 +++++++++ ...NKOFA_IT_OPS_KEYCLOAK_PORTAL_NEXT_STEPS.md | 48 +++ .../SANKOFA_IT_OPS_LIVE_INVENTORY_SCRIPTS.md | 368 ++++++++++++++++++ .../keycloak-sankofa-ensure-it-admin-role.sh | 120 ++++++ .../deployment/sync-sankofa-portal-7801.sh | 1 + scripts/it-ops/compute_ipam_drift.py | 203 ++++++++++ .../it-ops/export-live-inventory-and-drift.sh | 51 +++ .../it-ops/lib/collect_inventory_remote.py | 109 ++++++ services/sankofa-it-read-api/README.md | 38 ++ services/sankofa-it-read-api/server.py | 188 +++++++++ 14 files changed, 1384 insertions(+) create mode 100644 .github/workflows/live-inventory-drift.yml create mode 100644 config/systemd/sankofa-it-inventory-export.service.example create mode 100644 config/systemd/sankofa-it-inventory-export.timer.example create mode 100644 config/systemd/sankofa-it-read-api.service.example create mode 100644 docs/02-architecture/SANKOFA_IT_OPERATIONS_CONTROLLER_SPEC.md create mode 100644 docs/03-deployment/SANKOFA_IT_OPS_KEYCLOAK_PORTAL_NEXT_STEPS.md create mode 100644 docs/03-deployment/SANKOFA_IT_OPS_LIVE_INVENTORY_SCRIPTS.md create mode 100755 
scripts/deployment/keycloak-sankofa-ensure-it-admin-role.sh create mode 100755 scripts/it-ops/compute_ipam_drift.py create mode 100755 scripts/it-ops/export-live-inventory-and-drift.sh create mode 100755 scripts/it-ops/lib/collect_inventory_remote.py create mode 100644 services/sankofa-it-read-api/README.md create mode 100755 services/sankofa-it-read-api/server.py diff --git a/.github/workflows/live-inventory-drift.yml b/.github/workflows/live-inventory-drift.yml new file mode 100644 index 0000000..5d64534 --- /dev/null +++ b/.github/workflows/live-inventory-drift.yml @@ -0,0 +1,29 @@ +# Live Proxmox guest inventory + drift vs config/ip-addresses.conf. +# GitHub-hosted runners usually cannot reach 192.168.11.x; workflow still produces +# drift.json with seed_unreachable. Use a self-hosted LAN runner or run locally: +# bash scripts/it-ops/export-live-inventory-and-drift.sh +name: Live inventory and IPAM drift + +on: + workflow_dispatch: + schedule: + - cron: '25 6 * * 1' + +jobs: + drift: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Export live inventory (LAN optional) + run: bash scripts/it-ops/export-live-inventory-and-drift.sh + continue-on-error: true + + - name: Upload artifacts + uses: actions/upload-artifact@v4 + if: always() + with: + name: live-inventory-drift + path: | + reports/status/live_inventory.json + reports/status/drift.json diff --git a/config/systemd/sankofa-it-inventory-export.service.example b/config/systemd/sankofa-it-inventory-export.service.example new file mode 100644 index 0000000..ae05957 --- /dev/null +++ b/config/systemd/sankofa-it-inventory-export.service.example @@ -0,0 +1,18 @@ +# Weekly (or on-demand) live inventory + drift export on a host that has repo + LAN SSH to Proxmox. +# Pair with sankofa-it-inventory-export.timer.example. 
+# +# sudo cp config/systemd/sankofa-it-inventory-export.service.example /etc/systemd/system/sankofa-it-inventory-export.service +# sudo cp config/systemd/sankofa-it-inventory-export.timer.example /etc/systemd/system/sankofa-it-inventory-export.timer +# sudo systemctl daemon-reload && sudo systemctl enable --now sankofa-it-inventory-export.timer +# +[Unit] +Description=Export Proxmox live inventory and IPAM drift (proxmox repo) +After=network-online.target +Wants=network-online.target + +[Service] +Type=oneshot +User=root +WorkingDirectory=/opt/proxmox +# Load PROXMOX_HOST / SSH keys as needed; script uses config/ip-addresses.conf + .env when present. +ExecStart=/usr/bin/bash /opt/proxmox/scripts/it-ops/export-live-inventory-and-drift.sh diff --git a/config/systemd/sankofa-it-inventory-export.timer.example b/config/systemd/sankofa-it-inventory-export.timer.example new file mode 100644 index 0000000..1770022 --- /dev/null +++ b/config/systemd/sankofa-it-inventory-export.timer.example @@ -0,0 +1,10 @@ +# Run inventory export weekly (Sunday 03:30 UTC). Adjust OnCalendar for your ops window. +[Unit] +Description=Timer — Proxmox live inventory + drift export + +[Timer] +OnCalendar=Sun *-*-* 03:30:00 +Persistent=true + +[Install] +WantedBy=timers.target diff --git a/config/systemd/sankofa-it-read-api.service.example b/config/systemd/sankofa-it-read-api.service.example new file mode 100644 index 0000000..ce08aa0 --- /dev/null +++ b/config/systemd/sankofa-it-read-api.service.example @@ -0,0 +1,24 @@ +# Example systemd unit — IT inventory read API (Phase 0 stub). +# Copy to /etc/systemd/system/sankofa-it-read-api.service, adjust paths and User=. 
+# +# sudo cp config/systemd/sankofa-it-read-api.service.example /etc/systemd/system/sankofa-it-read-api.service +# sudo systemctl daemon-reload && sudo systemctl enable --now sankofa-it-read-api +# +[Unit] +Description=Sankofa IT read API (live inventory JSON) +After=network.target + +[Service] +Type=simple +User=root +WorkingDirectory=/opt/proxmox +Environment=IT_READ_API_HOST=127.0.0.1 +Environment=IT_READ_API_PORT=8787 +# Environment=IT_READ_API_KEY=change-me +# Optional browser CORS (prefer portal /api/it/* proxy): Environment=IT_READ_API_CORS_ORIGINS=https://portal.sankofa.nexus +ExecStart=/usr/bin/python3 /opt/proxmox/services/sankofa-it-read-api/server.py +Restart=on-failure +RestartSec=5 + +[Install] +WantedBy=multi-user.target diff --git a/docs/02-architecture/SANKOFA_IT_OPERATIONS_CONTROLLER_SPEC.md b/docs/02-architecture/SANKOFA_IT_OPERATIONS_CONTROLLER_SPEC.md new file mode 100644 index 0000000..f366f4c --- /dev/null +++ b/docs/02-architecture/SANKOFA_IT_OPERATIONS_CONTROLLER_SPEC.md @@ -0,0 +1,177 @@ +# Sankofa IT operations controller — architecture spec + +**Status:** Draft for engineering and IT leadership alignment +**Last updated:** 2026-04-08 (Phase 0 live-first inventory section added) +**Audience:** IT team, platform ops, Sankofa admin product owners + +--- + +## 1. Goals + +You need a single operational program that covers: + +| Capability | Intent | +|------------|--------| +| **IP inventory** | Authoritative list of every LAN/WAN/VIP assignment, owner, service, and lifecycle (no drift between spreadsheets and `config/ip-addresses.conf`). | +| **VLAN design** | Move from today’s **flat VLAN 11** to the **planned segmentation** (validators, RPC, explorer, Sankofa services, tenants) without breaking production. | +| **Port mapping** | Physical: switch port ↔ patch panel ↔ host NIC ↔ logical bond/VLAN. Logical: UDM port forwards ↔ NPM host ↔ upstream CT/VM. 
| +| **Host efficiency** | Compare **actual** Proxmox capacity (CPU/RAM/storage/network) to workload placement; drive consolidation, spare-node use, and subscription/licensing discipline. | +| **IT admin UI** | **HTML controller** under the **Sankofa admin** surface so the IT team can view/control interfaces, assign **licenses/entitlements**, run **provisioning** workflows, and support **billing** (quotes, usage, invoices handoff). | + +This document defines **how** that fits your existing stack (Proxmox cluster, UDM Pro, UniFi, NPMplus, Keycloak, Phoenix/dbis_core marketplace) and a **phased** path so you do not boil the ocean. + +--- + +## 2. Current state (facts from this repo) + +- **IP truth is split** across `config/ip-addresses.conf`, `docs/04-configuration/ALL_VMIDS_ENDPOINTS.md`, and `docs/11-references/NETWORK_CONFIGURATION_MASTER.md`. Automated snapshots: `scripts/verify/poll-proxmox-cluster-hardware.sh`, `reports/status/hardware_and_connected_inventory_*.md`. +- **VLANs:** Production today is **VLAN 11 only** for `192.168.11.0/24`. **Planned** VLANs (110–112, 120, 160, 200–203) are documented in [NETWORK_CONFIGURATION_MASTER.md](../11-references/NETWORK_CONFIGURATION_MASTER.md) but **not** implemented as separate broadcast domains on the wire. +- **Sankofa admin:** `admin.sankofa.nexus` is **client SSO administration** today (same upstream as the portal unless split). See [FQDN_EXPECTED_CONTENT.md](EXPECTED_WEB_CONTENT.md), [ALL_VMIDS_ENDPOINTS.md](../04-configuration/ALL_VMIDS_ENDPOINTS.md) (VMID **7801**). Portal source: sibling repo **`Sankofa/portal`** (`scripts/deployment/sync-sankofa-portal-7801.sh`). +- **Marketplace / commercial:** Partner IRU flows live in **`dbis_core`** (API + React); native infra is mostly **docs + Proxmox**, not one database. See [SANKOFA_MARKETPLACE_SURFACES.md](../03-deployment/SANKOFA_MARKETPLACE_SURFACES.md). 
+ +**Gap:** There is **no** single product today that unifies IPAM, switch port data, Proxmox actions, UniFi, and billing under Sankofa admin. This spec is the blueprint to add it. + +--- + +## 3. Target architecture (recommended) + +### 3.1 UI placement + +| Option | Pros | Cons | +|--------|------|------| +| **A — New `/it` (or `/ops`) app route** inside **`Sankofa/portal`**, gated by Keycloak group `sankofa-it-admin` | One TLS hostname, shared session patterns, fastest path for “under admin” | Portal bundle grows; must isolate client admin vs IT super-admin | +| **B — Dedicated host** `it.sankofa.nexus` → small **Next.js/Vite** SPA + BFF | Strong separation, independent deploy cadence | Extra NPM row, cert, pipeline | +| **C — Embed Grafana + NetBox** only | Quick graphs / IPAM | Weak billing/licensing story; less “Sankofa branded” control | + +**Recommendation:** **Option A** for MVP (fastest), with **API on a dedicated backend** so you can later move the shell to **B** without rewriting integrations. + +### 3.2 Backend (“control plane API”) + +Introduce a **small service** (name e.g. `sankofa-it-api`) **not** on the public internet without auth: + +- **Network:** VLAN 11 only or **private listener** + NPM **internal** host; OIDC **client credentials** or **user JWT** from Keycloak. +- **Responsibilities:** + - **Read models:** IPAM, devices, port maps, Proxmox inventory snapshot, UniFi device list (cached). + - **Write models:** change requests with **audit log** (who/when/what); optional **approval** queue for destructive actions. + - **Connectors (adapters):** Proxmox API, UniFi Network API (UDM), NPM API (already scripted in repo), optional NetBox later. +- **Do not** put Proxmox root tokens in the browser; **BFF** holds secrets server-side. 
+ +### 3.3 Data model (minimum viable) + +| Entity | Fields (illustrative) | +|--------|------------------------| +| **Subnet / VLAN** | id, vlan_id, cidr, name, environment, routing notes | +| **IP assignment** | address, hostname, vmid?, mac?, vlan, owner_team, service, source_ref (`ip-addresses.conf` key), status | +| **Physical port map** | switch_id, switch_port, panel_ref, far_end_host, far_end_nic, vlan_membership, speed, lacp_group | +| **Host / hypervisor** | serial, model, cluster node, CPU/RAM/disk summary (from poll script / Proxmox) | +| **License / entitlement** | sku_id, seat_count, valid_from/to, bound_org_or_project, external_ref (Stripe/subscription id) | +| **Provisioning job** | type (create_ct, resize_disk, assign_ip), payload, status, correlation_id | + +Start with **Postgres** (you already run many PG instances; a **dedicated small CT** for IT data avoids coupling to app databases). + +### 3.4 Billing and licenses + +Treat **billing** as **integrations**, not a from-scratch ERP: + +- **Licenses / seats:** map to **entitlements** table + Keycloak **groups** or custom claims for “can open IT console / can approve provision.” +- **Usage metering:** Proxmox **storage and CPU** per VMID, NPM bandwidth (optional), public IP count — **async jobs** pushing aggregates nightly. +- **Invoicing:** export to **Stripe Billing**, **QuickBooks**, or **NetSuite** via CSV/API; the controller shows **status** and **line items**, not necessarily full double-entry ledger on day one. + +Partner marketplace pricing already has patterns in **`dbis_core`**; **native** infra SKUs should either **reuse** `IruOffering`-style tables or **link** by `external_sku_id` to avoid two unrelated catalogs. + +--- + +## 4. VLAN and efficiency priorities (what matters most first) + +Aligned with [NETWORK_CONFIGURATION_MASTER.md](../11-references/NETWORK_CONFIGURATION_MASTER.md): + +1. 
**Document + enforce IP uniqueness** before VLAN migration (ARP incidents already noted around Keycloak IP in E2E docs). Automated **diff**: live `ip neigh` / Proxmox CT IPs vs IPAM. +2. **Segment in this order:** (a) **out-of-band / IPMI** if any, (b) **tenant-facing** workloads (VLAN 200+), (c) **Besu validators/RPC**, (d) **Sankofa app tier** — so blast radius reduction matches risk. +3. **Use spare cluster capacity:** **r630-03** / **r630-04** are cluster members with large local/ceph-related storage; placing **new** stateless or batch workloads there reduces pressure on r630-01/02 (see network master narrative). +4. **ML110 cutover:** WAN aggregator repurpose changes **.10** from Proxmox to firewall; the controller’s IPAM must flag **migration status** per host. + +--- + +## 5. Port mapping deliverables + +| Layer | Tool / owner | Output | +|-------|----------------|--------| +| **Physical (UniFi XG + patch)** | IT + DCIM template | Spreadsheet or **NetBox** cables + interfaces | +| **UDM** | UniFi export + manual | Port forward matrix (already partially in network master) | +| **NPM** | `scripts/nginx-proxy-manager/update-npmplus-proxy-hosts-api.sh` + API | Proxy host rows = **logical** port map to upstream | +| **Proxmox** | `vmbr`, VLAN-aware flags | Map CT `net0` → bridge → VLAN | + +The HTML controller should show a **joined view**: *public hostname → NPM → LAN IP:port → VMID → node → switch port* (where data exists). 
+ +--- + +## 5.1 Live data strategy (source of truth) + +| Layer | Primary live source | Declared fallback | Drift handling | +|-------|---------------------|-------------------|----------------| +| **VMID, node, status, guest IP** | Proxmox: `pvesh get /cluster/resources` + guest config files on shared `/etc/pve` | [ALL_VMIDS_ENDPOINTS.md](../04-configuration/ALL_VMIDS_ENDPOINTS.md) | VMID/IP mismatch; guests only in doc or only on cluster | +| **Hypervisor capacity** | `scripts/verify/poll-proxmox-cluster-hardware.sh` | [PROXMOX_HOSTS_COMPLETE_HARDWARE_CONFIG.md](PROXMOX_HOSTS_COMPLETE_HARDWARE_CONFIG.md) | Refresh after hardware changes | +| **LAN env keys** | Parsed literals from `ip-addresses.conf` | Same file in git | `guest_ips_not_in_ip_addresses_conf` vs `ip_addresses_conf_ips_not_on_guests`; exclude `PROXMOX_HOST_*`, `NETWORK_GATEWAY`, `UDM_PRO_*`, `WAN_AGGREGATOR_*` from “missing guest” noise | +| **Public edge** | NPM API (fleet scripts) | E2E tables | Hostname → upstream drift | +| **Switch/AP** | UniFi Network API | NetBox / spreadsheet | Manual until imported | + +**Freshness:** every artifact includes ISO8601 **`collected_at`**; failed collectors must record **`error`** in `drift.json` and must not be presented as current in the IT UI. + +--- + +## 6. Phased roadmap + +| Phase | Scope | Exit criteria | +|-------|--------|----------------| +| **0 — Inventory hardening (live-first)** | **Runtime truth:** Proxmox `pvesh /cluster/resources` + per-guest config (`net0` / `ipconfig0`) for IP, merged with **`config/ip-addresses.conf`** as **declared** literals; emit **`live_inventory.json`** + **`drift.json`** with **`collected_at`**; duplicate guest IPs → fail or alert. **Scripts (add under `scripts/it-ops/`):** `export-live-inventory-and-drift.sh` (SSH to seed node, pipe `lib/collect_inventory_remote.py`), `compute_ipam_drift.py` (merge + drift). 
**CI:** `.github/workflows/live-inventory-drift.yml` — `workflow_dispatch` + weekly schedule; on GitHub-hosted runners without LAN, collector exits 0 after writing `drift.json` with `seed_unreachable`. **UI/BFF later:** never show inventory without freshness metadata. | +| **1 — Read-only IT dashboard** | Keycloak group `sankofa-it-admin`; SPA pages: IPs, VLAN plan (current vs target), cluster nodes, hardware poll link | IT can onboard without SSH | +| **2 — Port map CRUD** | DB + UI for switch/port; import from UniFi API | Export CSV/NetBox | +| **3 — Controlled provisioning** | BFF + Proxmox API: start/stop scoped CT, **dry-run default** (align with `proxmox-production-safety` rules) | Audit log + allowlists | +| **4 — Entitlements + billing hooks** | License assignment UI; Stripe (or chosen) webhook → entitlement | Invoice export for finance | + +--- + +## 7. Security and governance + +- **Separate** IT super-admin from **client** `admin.sankofa.nexus` users (different Keycloak groups). +- **MFA** required for IT group; **break-glass** local Proxmox access documented, not exposed in UI. +- **Change management:** any **write** to network edge (UDM) or **production** Proxmox requires ticket id in API payload (optional field, enforced in policy later). + +--- + +## 8. 
Related documents + +| Topic | Doc | +|-------|-----| +| IPs, VLAN plan, port forwards | [NETWORK_CONFIGURATION_MASTER.md](../11-references/NETWORK_CONFIGURATION_MASTER.md) | +| VMID ↔ IP | [ALL_VMIDS_ENDPOINTS.md](../04-configuration/ALL_VMIDS_ENDPOINTS.md) | +| Cabling / 10G | [13_NODE_NETWORK_AND_CABLING_CHECKLIST.md](../11-references/13_NODE_NETWORK_AND_CABLING_CHECKLIST.md) | +| Marketplace vs portal | [SANKOFA_MARKETPLACE_SURFACES.md](../03-deployment/SANKOFA_MARKETPLACE_SURFACES.md) | +| FQDN roles | [EXPECTED_WEB_CONTENT.md](EXPECTED_WEB_CONTENT.md) | +| Hardware poll | `scripts/verify/poll-proxmox-cluster-hardware.sh`, `reports/status/hardware_and_connected_inventory_*.md` | +| Proxmox safety | `.cursor/rules/proxmox-production-safety.mdc`, `scripts/lib/proxmox-production-guard.sh` | + +--- + +## 9. Next engineering actions (concrete) + +**Done in-repo (Phase 0+):** + +1. **`scripts/it-ops/`** — remote collector (`lib/collect_inventory_remote.py`), `compute_ipam_drift.py` (merges **`ip-addresses.conf`** + **`ALL_VMIDS_ENDPOINTS.md`** table rows), `export-live-inventory-and-drift.sh` → `reports/status/live_inventory.json` + `drift.json`. +2. **Read API stub** — `services/sankofa-it-read-api/server.py` (GET `/health`, `/v1/inventory/live`, `/v1/inventory/drift`; POST refresh with API key). systemd example: `config/systemd/sankofa-it-read-api.service.example`. +3. **Workflow** `.github/workflows/live-inventory-drift.yml` — `workflow_dispatch` + weekly; artifacts; no LAN on default runners. +4. **Validation** — `scripts/validation/validate-config-files.sh` runs `py_compile` on IT scripts + read API. +5. **Docs** — [SANKOFA_IT_OPS_LIVE_INVENTORY_SCRIPTS.md](../03-deployment/SANKOFA_IT_OPS_LIVE_INVENTORY_SCRIPTS.md), [SANKOFA_IT_OPS_KEYCLOAK_PORTAL_NEXT_STEPS.md](../03-deployment/SANKOFA_IT_OPS_KEYCLOAK_PORTAL_NEXT_STEPS.md). +6. 
**Keycloak automation (proxmox repo)** — `scripts/deployment/keycloak-sankofa-ensure-it-admin-role.sh` creates realm role **`sankofa-it-admin`**; operators still assign the role to users in Admin Console. +7. **Portal `/it` (Sankofa/portal repo, sibling clone)** — `src/app/it/page.tsx`, `src/app/api/it/*` (server proxy + `IT_READ_API_URL` / `IT_READ_API_KEY` on CT 7801); credentials **`ADMIN`** propagated into JWT roles for bootstrap (`src/lib/auth.ts`). +8. **LAN schedule examples** — `config/systemd/sankofa-it-inventory-export.timer.example` + `.service.example` for weekly `export-live-inventory-and-drift.sh`. + +**Remaining (other repos / product):** + +1. **Full BFF** with OIDC (Keycloak) and Postgres — **`dbis_core` vs dedicated CT** — decide once. +2. **Keycloak** — assign **`sankofa-it-admin`** to real IT users (role creation is scripted; mapping is manual policy). +3. **Deploy** — `sync-sankofa-portal-7801.sh` after pulling portal changes; set **`IT_READ_API_URL`** on the portal LXC. +4. **Schedule on LAN** — enable the timer on a host with repo + SSH to Proxmox; optional same cadence for `poll-proxmox-cluster-hardware.sh`. +5. **UniFi / NPM** live collectors — Phase 2 of this spec. + +This spec does **not** replace change control; it gives you a **single product vision** so IP, VLAN, ports, hosts, licenses, and billing support evolve together instead of in silos. diff --git a/docs/03-deployment/SANKOFA_IT_OPS_KEYCLOAK_PORTAL_NEXT_STEPS.md b/docs/03-deployment/SANKOFA_IT_OPS_KEYCLOAK_PORTAL_NEXT_STEPS.md new file mode 100644 index 0000000..d175c2b --- /dev/null +++ b/docs/03-deployment/SANKOFA_IT_OPS_KEYCLOAK_PORTAL_NEXT_STEPS.md @@ -0,0 +1,48 @@ +# IT operations UI — Keycloak and Sankofa portal next steps + +**Purpose:** Close the gap between Phase 0 (live inventory scripts + read API) and the full **Sankofa admin** IT controller described in [SANKOFA_IT_OPERATIONS_CONTROLLER_SPEC.md](../02-architecture/SANKOFA_IT_OPERATIONS_CONTROLLER_SPEC.md). 
+ +--- + +## 1. Keycloak + +1. Create realm role **`sankofa-it-admin`** (idempotent): `bash scripts/deployment/keycloak-sankofa-ensure-it-admin-role.sh` (needs `KEYCLOAK_ADMIN_PASSWORD` in repo `.env`, SSH to Proxmox, CT 7802). Then assign the role to IT staff in the Keycloak Admin Console (or use a group + token mapper if you prefer group claims). +2. Map **only** platform IT staff; require **MFA** at realm or IdP policy. +3. **Do not** reuse client-admin groups used for `admin.sankofa.nexus` tenant administration unless policy explicitly allows. +4. Optional: client scope **it-ops** with claim `it_admin=true` for the IT BFF audience. + +**Reference:** Keycloak CT / VMID in [ALL_VMIDS_ENDPOINTS.md](../04-configuration/ALL_VMIDS_ENDPOINTS.md); portal login runbook `scripts/deployment/enable-sankofa-portal-login-7801.sh`. + +--- + +## 2. Sankofa portal (`Sankofa/portal` repo) + +1. **Implemented:** protected route **`/it`** (`src/app/it/page.tsx`) gated by **`sankofa-it-admin`** / **`ADMIN`** (credentials bootstrap). API proxies: `GET /api/it/drift`, `GET /api/it/inventory`, `POST /api/it/refresh`. +2. **Configure on CT 7801:** **`IT_READ_API_URL`** (e.g. `http://192.168.11.:8787`) and optional **`IT_READ_API_KEY`** (server-only; never `NEXT_PUBLIC_*`). Proxies to the read API on VLAN 11. +3. **Do not** expose `IT_READ_API_KEY` or Proxmox credentials to the browser bundle. +4. Display **`collected_at`** from JSON; show a stale warning if older than your SLO (e.g. 24h). + +**Deploy:** `scripts/deployment/sync-sankofa-portal-7801.sh` after portal changes. + +--- + +## 3. NPM + +Add an **internal** proxy host (optional TLS) from a hostname such as `it-api.sankofa.nexus` (LAN-only DNS) to **`127.0.0.1:8787`** on the host running the read API, **or** bind the service on a dedicated CT IP and point NPM at that upstream. + +--- + +## 4. Full BFF (later) + +Replace `services/sankofa-it-read-api/server.py` with a service that: + +- Validates **OIDC** (Keycloak) JWTs. 
+- Stores **audit** rows for refresh and future writes. +- Adds **UniFi** and **NPM** collectors with `collected_at` per domain. + +--- + +## Related + +- [SANKOFA_IT_OPS_LIVE_INVENTORY_SCRIPTS.md](SANKOFA_IT_OPS_LIVE_INVENTORY_SCRIPTS.md) +- [SANKOFA_MARKETPLACE_SURFACES.md](SANKOFA_MARKETPLACE_SURFACES.md) (native vs partner; catalog alignment) diff --git a/docs/03-deployment/SANKOFA_IT_OPS_LIVE_INVENTORY_SCRIPTS.md b/docs/03-deployment/SANKOFA_IT_OPS_LIVE_INVENTORY_SCRIPTS.md new file mode 100644 index 0000000..ecd99c4 --- /dev/null +++ b/docs/03-deployment/SANKOFA_IT_OPS_LIVE_INVENTORY_SCRIPTS.md @@ -0,0 +1,368 @@ +# IT ops Phase 0 — live inventory scripts (implementation appendix) + +**Purpose:** Canonical copy of Phase 0 scripts (also on disk under `scripts/it-ops/`). Use this page if you need to restore or review inline. +**Spec:** [SANKOFA_IT_OPERATIONS_CONTROLLER_SPEC.md](../02-architecture/SANKOFA_IT_OPERATIONS_CONTROLLER_SPEC.md) section 5.1 and Phase 0. + +## File layout + +| Path | Role | +|------|------| +| `scripts/it-ops/lib/collect_inventory_remote.py` | Run on PVE via SSH stdin (`python3 -`) | +| `scripts/it-ops/compute_ipam_drift.py` | Local: merge live JSON + `config/ip-addresses.conf` + **`ALL_VMIDS_ENDPOINTS.md`** pipe tables (`--all-vmids-md`) | +| `scripts/it-ops/export-live-inventory-and-drift.sh` | Orchestrator: ping seed, SSH, write `reports/status/` | +| `services/sankofa-it-read-api/server.py` | Read-only HTTP: `/v1/inventory/live`, `/v1/inventory/drift` | +| `.github/workflows/live-inventory-drift.yml` | `workflow_dispatch` + weekly (graceful skip without LAN) | + +**Exit codes (`compute_ipam_drift.py`):** **2** = duplicate guest IP; **0** otherwise. **`vmid_ip_mismatch_live_vs_all_vmids_doc`** in `drift.json` is informational (docs often lag live CT config). + +--- + +## `scripts/it-ops/lib/collect_inventory_remote.py` + +```python +#!/usr/bin/env python3 +"""Run ON a Proxmox cluster node (as root). 
Stdout: JSON live guest inventory.""" +from __future__ import annotations + +import json +import re +import subprocess +import sys +from datetime import datetime, timezone + + +def _run(cmd: list[str]) -> str: + return subprocess.check_output(cmd, text=True, stderr=subprocess.DEVNULL) + + +def _extract_ip_from_net_line(line: str) -> str | None: + m = re.search(r"ip=([0-9.]+)", line) + return m.group(1) if m else None + + +def _read_config(path: str) -> str: + try: + with open(path, encoding="utf-8", errors="replace") as f: + return f.read() + except OSError: + return "" + + +def main() -> None: + collected_at = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + try: + raw = _run( + ["pvesh", "get", "/cluster/resources", "--output-format", "json"] + ) + resources = json.loads(raw) + except (subprocess.CalledProcessError, json.JSONDecodeError) as e: + json.dump( + { + "collected_at": collected_at, + "error": f"pvesh_cluster_resources_failed: {e}", + "guests": [], + }, + sys.stdout, + indent=2, + ) + return + + guests: list[dict] = [] + for r in resources: + t = r.get("type") + if t not in ("lxc", "qemu"): + continue + vmid = r.get("vmid") + node = r.get("node") + if vmid is None or not node: + continue + vmid_s = str(vmid) + name = r.get("name") or "" + status = r.get("status") or "" + + if t == "lxc": + cfg_path = f"/etc/pve/nodes/{node}/lxc/{vmid_s}.conf" + else: + cfg_path = f"/etc/pve/nodes/{node}/qemu-server/{vmid_s}.conf" + + body = _read_config(cfg_path) + ip = "" + for line in body.splitlines(): + if line.startswith("net0:"): + got = _extract_ip_from_net_line(line) + if got: + ip = got + break + if not ip and t == "qemu": + for line in body.splitlines(): + if line.startswith("ipconfig0:"): + got = _extract_ip_from_net_line(line) + if got: + ip = got + break + if not ip and t == "qemu": + for line in body.splitlines(): + if line.startswith("net0:"): + got = _extract_ip_from_net_line(line) + if got: + ip = got + break + + guests.append( + { + "vmid": 
vmid_s, + "type": t, + "node": str(node), + "name": name, + "status": status, + "ip": ip, + "config_path": cfg_path, + } + ) + + out = { + "collected_at": collected_at, + "guests": sorted(guests, key=lambda g: int(g["vmid"])), + } + json.dump(out, sys.stdout, indent=2) + + +if __name__ == "__main__": + main() +``` + +--- + +## `scripts/it-ops/compute_ipam_drift.py` + +```python +#!/usr/bin/env python3 +"""Merge live JSON with config/ip-addresses.conf; write live_inventory.json + drift.json.""" +from __future__ import annotations + +import argparse +import json +import re +import sys +from pathlib import Path + +IPV4_RE = re.compile( + r"(? tuple[dict[str, str], set[str]]: + var_map: dict[str, str] = {} + all_ips: set[str] = set() + if not path.is_file(): + return var_map, all_ips + for line in path.read_text(encoding="utf-8", errors="replace").splitlines(): + s = line.strip() + if not s or s.startswith("#") or "=" not in s: + continue + key, _, val = s.partition("=") + key = key.strip() + val = val.strip() + if val.startswith('"') and val.endswith('"'): + val = val[1:-1] + elif val.startswith("'") and val.endswith("'"): + val = val[1:-1] + var_map[key] = val + for m in IPV4_RE.findall(val): + all_ips.add(m) + return var_map, all_ips + + +def hypervisor_related_keys(var_map: dict[str, str]) -> set[str]: + keys = set() + for k in var_map: + ku = k.upper() + if any( + x in ku + for x in ( + "PROXMOX_HOST", + "PROXMOX_ML110", + "PROXMOX_R630", + "PROXMOX_R750", + "WAN_AGGREGATOR", + "NETWORK_GATEWAY", + "UDM_PRO", + "PUBLIC_IP_GATEWAY", + "PUBLIC_IP_ER605", + ) + ): + keys.add(k) + return keys + + +def main() -> None: + ap = argparse.ArgumentParser() + ap.add_argument("--live", type=Path, help="live JSON file (default stdin)") + ap.add_argument("--ip-conf", type=Path, default=Path("config/ip-addresses.conf")) + ap.add_argument("--out-dir", type=Path, required=True) + args = ap.parse_args() + + live_raw = args.live.read_text(encoding="utf-8") if args.live else 
sys.stdin.read() + live = json.loads(live_raw) + guests = live.get("guests") or [] + var_map, conf_ips = parse_ip_addresses_conf(args.ip_conf) + hyp_keys = hypervisor_related_keys(var_map) + hyp_ips: set[str] = set() + for k in hyp_keys: + if k not in var_map: + continue + for m in IPV4_RE.findall(var_map[k]): + hyp_ips.add(m) + + ip_to_vmids: dict[str, list[str]] = {} + for g in guests: + ip = (g.get("ip") or "").strip() + if not ip: + continue + ip_to_vmids.setdefault(ip, []).append(g.get("vmid", "?")) + + duplicate_ips = {ip: vms for ip, vms in ip_to_vmids.items() if len(vms) > 1} + guest_ip_set = set(ip_to_vmids.keys()) + conf_only = sorted(conf_ips - guest_ip_set - hyp_ips) + live_only = sorted(guest_ip_set - conf_ips) + + drift = { + "collected_at": live.get("collected_at"), + "guest_count": len(guests), + "duplicate_ips": duplicate_ips, + "guest_ips_not_in_ip_addresses_conf": live_only, + "ip_addresses_conf_ips_not_on_guests": conf_only, + "hypervisor_and_infra_ips_excluded_from_guest_match": sorted(hyp_ips), + "notes": [], + } + if live.get("error"): + drift["notes"].append(live["error"]) + + inv_out = { + "collected_at": live.get("collected_at"), + "source": "proxmox_cluster_pvesh_plus_config", + "guests": guests, + } + + args.out_dir.mkdir(parents=True, exist_ok=True) + (args.out_dir / "live_inventory.json").write_text( + json.dumps(inv_out, indent=2), encoding="utf-8" + ) + (args.out_dir / "drift.json").write_text( + json.dumps(drift, indent=2), encoding="utf-8" + ) + print(f"Wrote {args.out_dir / 'live_inventory.json'}") + print(f"Wrote {args.out_dir / 'drift.json'}") + sys.exit(2 if duplicate_ips else 0) + + +if __name__ == "__main__": + main() +``` + +--- + +## `scripts/it-ops/export-live-inventory-and-drift.sh` + +```bash +#!/usr/bin/env bash +# Live Proxmox guest inventory + drift vs config/ip-addresses.conf. +# Usage: bash scripts/it-ops/export-live-inventory-and-drift.sh +# Requires: SSH key root@SEED, python3 locally and on PVE. 
+set -euo pipefail +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" +# shellcheck source=/dev/null +source "${PROJECT_ROOT}/config/ip-addresses.conf" 2>/dev/null || true +SEED="${SEED_HOST:-${PROXMOX_HOST_R630_01:-192.168.11.11}}" +OUT_DIR="${OUT_DIR:-${PROJECT_ROOT}/reports/status}" +TS="$(date +%Y%m%d_%H%M%S)" +TMP="${TMPDIR:-/tmp}/live_inv_${TS}.json" +PY="${SCRIPT_DIR}/lib/collect_inventory_remote.py" + +mkdir -p "$OUT_DIR" + +stub_unreachable() { + python3 - </dev/null 2>&1; then + stub_unreachable >"$TMP" +else + if ! ssh -o BatchMode=yes -o ConnectTimeout=15 -o StrictHostKeyChecking=no \ + "root@${SEED}" "python3 -" <"$PY" >"$TMP" 2>/dev/null; then + stub_unreachable >"$TMP" + fi +fi + +set +e +python3 "${SCRIPT_DIR}/compute_ipam_drift.py" --live "$TMP" \ + --ip-conf "${PROJECT_ROOT}/config/ip-addresses.conf" --out-dir "$OUT_DIR" +DRIFT_RC=$? +set -e + +cp -f "$OUT_DIR/live_inventory.json" "${OUT_DIR}/live_inventory_${TS}.json" 2>/dev/null || true +cp -f "$OUT_DIR/drift.json" "${OUT_DIR}/drift_${TS}.json" 2>/dev/null || true +rm -f "$TMP" +echo "Latest: ${OUT_DIR}/live_inventory.json , ${OUT_DIR}/drift.json" +# Exit 2 when duplicate_ips present (for CI). +exit "${DRIFT_RC}" +``` + +After creating files: `chmod +x scripts/it-ops/export-live-inventory-and-drift.sh scripts/it-ops/compute_ipam_drift.py` + +--- + +## `.github/workflows/live-inventory-drift.yml` + +```yaml +name: Live inventory and IPAM drift + +on: + workflow_dispatch: + schedule: + - cron: '25 6 * * 1' + +jobs: + drift: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Export live inventory (LAN optional) + run: | + set +e + bash scripts/it-ops/export-live-inventory-and-drift.sh + echo "exit=$?" 
+ continue-on-error: true + - name: Upload artifacts + uses: actions/upload-artifact@v4 + if: always() + with: + name: live-inventory-drift + path: | + reports/status/live_inventory.json + reports/status/drift.json +``` + +**Note:** On GitHub-hosted runners the collector usually writes `seed_unreachable`; use a **self-hosted LAN runner** for real data, or run the shell script on the operator workstation. + +--- + +## `AGENTS.md` row (Quick pointers table) + +Add: + +`| IT live inventory + drift (LAN) | `bash scripts/it-ops/export-live-inventory-and-drift.sh` → `reports/status/live_inventory.json`, `drift.json` — see [docs/03-deployment/SANKOFA_IT_OPS_LIVE_INVENTORY_SCRIPTS.md](docs/03-deployment/SANKOFA_IT_OPS_LIVE_INVENTORY_SCRIPTS.md) |` + +--- + +## `docs/MASTER_INDEX.md` + +Add a row pointing to this deployment appendix and the updated spec. diff --git a/scripts/deployment/keycloak-sankofa-ensure-it-admin-role.sh b/scripts/deployment/keycloak-sankofa-ensure-it-admin-role.sh new file mode 100755 index 0000000..bd37b68 --- /dev/null +++ b/scripts/deployment/keycloak-sankofa-ensure-it-admin-role.sh @@ -0,0 +1,120 @@ +#!/usr/bin/env bash +# Create Keycloak realm role sankofa-it-admin if missing (IT operations portal /it gate). +# Runs Admin API against http://127.0.0.1:8080 inside the Keycloak CT (same pattern as +# keycloak-sankofa-ensure-client-redirects-via-proxmox-pct.sh). +# +# After the role exists, assign it to IT staff in Keycloak Admin (Users → Role mapping) +# or map it to a group and add a token mapper if you rely on group claims. +# +# Env: KEYCLOAK_ADMIN_PASSWORD in repo .env; optional KEYCLOAK_REALM (default master), +# KEYCLOAK_CT_VMID (7802), PROXMOX_HOST. +# +# Usage: +# ./scripts/deployment/keycloak-sankofa-ensure-it-admin-role.sh [--dry-run] +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." 
&& pwd)" +# shellcheck source=/dev/null +source "${PROJECT_ROOT}/config/ip-addresses.conf" 2>/dev/null || true +if [ -f "$PROJECT_ROOT/.env" ]; then + set +u + set -a + # shellcheck source=/dev/null + source "$PROJECT_ROOT/.env" 2>/dev/null || true + set +a + set -u +fi + +PROXMOX_HOST="${PROXMOX_HOST:-${PROXMOX_HOST_R630_01:-192.168.11.11}}" +KEYCLOAK_CT_VMID="${KEYCLOAK_CT_VMID:-${SANKOFA_KEYCLOAK_VMID:-7802}}" +REALM="${KEYCLOAK_REALM:-master}" +ADMIN_USER="${KEYCLOAK_ADMIN:-admin}" +ADMIN_PASS="${KEYCLOAK_ADMIN_PASSWORD:-}" +ROLE_NAME="${SANKOFA_IT_ADMIN_ROLE_NAME:-sankofa-it-admin}" +SSH_OPTS=(-o BatchMode=yes -o StrictHostKeyChecking=accept-new -o ConnectTimeout=15) + +DRY=0 +[[ "${1:-}" == "--dry-run" ]] && DRY=1 + +if [ -z "$ADMIN_PASS" ]; then + echo "KEYCLOAK_ADMIN_PASSWORD is not set in .env" >&2 + exit 1 +fi + +if [ "$DRY" = 1 ]; then + echo "[dry-run] Would ssh root@${PROXMOX_HOST} pct exec ${KEYCLOAK_CT_VMID} -- python3 (ensure realm role ${ROLE_NAME} in realm ${REALM})" + exit 0 +fi + +ssh "${SSH_OPTS[@]}" "root@${PROXMOX_HOST}" \ + "pct exec ${KEYCLOAK_CT_VMID} -- env KC_PASS=\"${ADMIN_PASS}\" ADMUSER=\"${ADMIN_USER}\" REALM=\"${REALM}\" ROLE_NAME=\"${ROLE_NAME}\" python3 -u -" <<'PY' +import json +import os +import urllib.error +import urllib.parse +import urllib.request + +base = "http://127.0.0.1:8080" +realm = os.environ["REALM"] +role_name = os.environ["ROLE_NAME"] +admin_user = os.environ["ADMUSER"] +password = os.environ["KC_PASS"] + + +def post_form(url: str, data: dict) -> dict: + body = urllib.parse.urlencode(data).encode() + req = urllib.request.Request(url, data=body, method="POST") + with urllib.request.urlopen(req, timeout=60) as resp: + return json.loads(resp.read().decode()) + + +tok = post_form( + f"{base}/realms/master/protocol/openid-connect/token", + { + "grant_type": "password", + "client_id": "admin-cli", + "username": admin_user, + "password": password, + }, +) +access = tok.get("access_token") +if not access: + raise 
SystemExit(f"token failed: {tok}") + +headers = {"Authorization": f"Bearer {access}"} +role_url = f"{base}/admin/realms/{realm}/roles/{urllib.parse.quote(role_name, safe='')}" +req_get = urllib.request.Request(role_url, headers=headers) +try: + with urllib.request.urlopen(req_get, timeout=60) as resp: + if resp.getcode() in (200, 204): + print(f"Realm role {role_name!r} already exists in {realm!r}.", flush=True) + raise SystemExit(0) +except urllib.error.HTTPError as e: + if e.code != 404: + err = e.read().decode() if e.fp else str(e) + raise SystemExit(f"GET role failed HTTP {e.code}: {err}") from e + +payload = json.dumps( + { + "name": role_name, + "description": "Sankofa IT operations (portal /it, inventory read API consumers)", + "clientRole": False, + } +).encode() +req_post = urllib.request.Request( + f"{base}/admin/realms/{realm}/roles", + data=payload, + headers={**headers, "Content-Type": "application/json"}, + method="POST", +) +try: + with urllib.request.urlopen(req_post, timeout=120) as resp: + if resp.getcode() not in (200, 201): + raise SystemExit(f"create role unexpected HTTP {resp.getcode()}") +except urllib.error.HTTPError as e: + err = e.read().decode() if e.fp else str(e) + raise SystemExit(f"POST role failed HTTP {e.code}: {err}") from e + +print(f"Created realm role {role_name!r} in realm {realm!r}. Assign it to IT users in Admin Console.", flush=True) +PY diff --git a/scripts/deployment/sync-sankofa-portal-7801.sh b/scripts/deployment/sync-sankofa-portal-7801.sh index 5cfea61..d20f4ca 100755 --- a/scripts/deployment/sync-sankofa-portal-7801.sh +++ b/scripts/deployment/sync-sankofa-portal-7801.sh @@ -106,5 +106,6 @@ echo "✅ Done. 
Verify:"
 echo "  curl -sS http://${IP_SANKOFA_PORTAL:-192.168.11.51}:3000/ | head -c 120"
 echo "  curl -sSI https://portal.sankofa.nexus/api/auth/signin | head -n 15"
 echo "  https://portal.sankofa.nexus/ (via NPM; corporate apex is sankofa.nexus → IP_SANKOFA_PUBLIC_WEB)"
+echo "  IT /it console: set IT_READ_API_URL (and optional IT_READ_API_KEY) in CT ${CT_APP_DIR}/.env — see portal/.env.example"
 echo ""
 echo "Legacy apex auth URL only if needed: SANKOFA_PORTAL_NEXTAUTH_URL=https://sankofa.nexus $0"
diff --git a/scripts/it-ops/compute_ipam_drift.py b/scripts/it-ops/compute_ipam_drift.py
new file mode 100755
index 0000000..7417f2c
--- /dev/null
+++ b/scripts/it-ops/compute_ipam_drift.py
@@ -0,0 +1,203 @@
+#!/usr/bin/env python3
+"""Merge live JSON with config/ip-addresses.conf; write live_inventory.json + drift.json."""
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import sys
+from pathlib import Path
+
+# Bare IPv4 dotted quad, not embedded inside a longer dotted run.
+IPV4_RE = re.compile(
+    r"(?<![0-9.])((?:[0-9]{1,3}\.){3}[0-9]{1,3})(?![0-9.])"
+)
+# ALL_VMIDS markdown pipe-table row: first cell is the numeric VMID, a later
+# cell holds the guest IP. NOTE(review): reconstructed after markup loss —
+# confirm column layout against docs/04-configuration/ALL_VMIDS_ENDPOINTS.md.
+MD_VMID_IP_ROW = re.compile(
+    r"^\|\s*(\d+)\s*\|(?:[^|]*\|)*?\s*((?:[0-9]{1,3}\.){3}[0-9]{1,3})\s*\|"
+)
+
+
+def is_lan_11(ip: str) -> bool:
+    return ip.startswith("192.168.11.")
+
+
+def parse_all_vmids_markdown(path: Path) -> tuple[set[str], dict[str, str]]:
+    """Extract declared LAN IPs and vmid->ip from ALL_VMIDS pipe tables."""
+    ips: set[str] = set()
+    vmid_to_ip: dict[str, str] = {}
+    if not path.is_file():
+        return ips, vmid_to_ip
+    for line in path.read_text(encoding="utf-8", errors="replace").splitlines():
+        m = MD_VMID_IP_ROW.match(line.strip())
+        if not m:
+            continue
+        vmid, ip = m.group(1), m.group(2)
+        if is_lan_11(ip):
+            ips.add(ip)
+        vmid_to_ip[vmid] = ip
+    return ips, vmid_to_ip
+
+
+def parse_ip_addresses_conf(path: Path) -> tuple[dict[str, str], set[str]]:
+    var_map: dict[str, str] = {}
+    all_ips: set[str] = set()
+    if not path.is_file():
+        return var_map, all_ips
+    for line in path.read_text(encoding="utf-8", errors="replace").splitlines():
+        s = line.strip()
+        if not s or s.startswith("#") or "=" not in s:
+            continue
+        key, _, val = s.partition("=")
+        key = key.strip()
+        val = val.strip()
+        if val.startswith('"') and
val.endswith('"'): + val = val[1:-1] + elif val.startswith("'") and val.endswith("'"): + val = val[1:-1] + var_map[key] = val + for m in IPV4_RE.findall(val): + all_ips.add(m) + return var_map, all_ips + + +def hypervisor_related_keys(var_map: dict[str, str]) -> set[str]: + keys = set() + for k in var_map: + ku = k.upper() + if any( + x in ku + for x in ( + "PROXMOX_HOST", + "PROXMOX_ML110", + "PROXMOX_R630", + "PROXMOX_R750", + "WAN_AGGREGATOR", + "NETWORK_GATEWAY", + "UDM_PRO", + "PUBLIC_IP_GATEWAY", + "PUBLIC_IP_ER605", + ) + ): + keys.add(k) + return keys + + +def main() -> None: + ap = argparse.ArgumentParser() + ap.add_argument("--live", type=Path, help="live JSON file (default stdin)") + ap.add_argument( + "--ip-conf", + type=Path, + default=Path("config/ip-addresses.conf"), + help="path to ip-addresses.conf", + ) + ap.add_argument("--out-dir", type=Path, required=True) + ap.add_argument( + "--all-vmids-md", + type=Path, + default=None, + help="optional ALL_VMIDS_ENDPOINTS.md for declared VMID/IP tables", + ) + args = ap.parse_args() + + if args.live: + live_raw = args.live.read_text(encoding="utf-8") + else: + live_raw = sys.stdin.read() + + try: + live = json.loads(live_raw) + except json.JSONDecodeError as e: + print(f"Invalid live JSON: {e}", file=sys.stderr) + sys.exit(1) + + guests = live.get("guests") or [] + var_map, conf_ips = parse_ip_addresses_conf(args.ip_conf) + doc_ips: set[str] = set() + vmid_to_ip_doc: dict[str, str] = {} + if args.all_vmids_md: + doc_ips, vmid_to_ip_doc = parse_all_vmids_markdown(args.all_vmids_md) + + declared_union = conf_ips | doc_ips + hyp_keys = hypervisor_related_keys(var_map) + hyp_ips: set[str] = set() + for k in hyp_keys: + if k not in var_map: + continue + for m in IPV4_RE.findall(var_map[k]): + hyp_ips.add(m) + + ip_to_vmids: dict[str, list[str]] = {} + vmid_to_ip_live: dict[str, str] = {} + for g in guests: + ip = (g.get("ip") or "").strip() + vmid = str(g.get("vmid", "")).strip() + if ip: + 
ip_to_vmids.setdefault(ip, []).append(vmid or "?") + if vmid and ip: + vmid_to_ip_live[vmid] = ip + + duplicate_ips = {ip: vms for ip, vms in ip_to_vmids.items() if len(vms) > 1} + guest_ip_set = set(ip_to_vmids.keys()) + + conf_only = sorted(conf_ips - guest_ip_set - hyp_ips) + live_only_legacy = sorted(guest_ip_set - conf_ips) + + declared_lan11 = {ip for ip in declared_union if is_lan_11(ip)} + guest_lan11 = {ip for ip in guest_ip_set if is_lan_11(ip)} + guest_lan_not_declared = sorted( + guest_lan11 - declared_union - hyp_ips + ) + declared_lan11_not_on_guests = sorted( + declared_lan11 - guest_ip_set - hyp_ips + ) + + vmid_ip_mismatch: list[dict[str, str]] = [] + for vmid, doc_ip in vmid_to_ip_doc.items(): + lip = vmid_to_ip_live.get(vmid) + if lip and doc_ip and lip != doc_ip: + vmid_ip_mismatch.append( + {"vmid": vmid, "live_ip": lip, "all_vmids_doc_ip": doc_ip} + ) + + drift = { + "collected_at": live.get("collected_at"), + "guest_count": len(guests), + "duplicate_ips": duplicate_ips, + "guest_ips_not_in_ip_addresses_conf": live_only_legacy, + "ip_addresses_conf_ips_not_on_guests": conf_only, + "guest_lan_ips_not_in_declared_sources": guest_lan_not_declared, + "declared_lan11_ips_not_on_live_guests": declared_lan11_not_on_guests, + "vmid_ip_mismatch_live_vs_all_vmids_doc": vmid_ip_mismatch, + "hypervisor_and_infra_ips_excluded_from_guest_match": sorted(hyp_ips), + "declared_sources": { + "ip_addresses_conf_ipv4_count": len(conf_ips), + "all_vmids_md_lan11_count": len(doc_ips), + }, + "notes": [], + } + if live.get("error"): + drift["notes"].append(str(live["error"])) + + inv_out = { + "collected_at": live.get("collected_at"), + "source": "proxmox_cluster_pvesh_plus_config", + "guests": guests, + } + + args.out_dir.mkdir(parents=True, exist_ok=True) + (args.out_dir / "live_inventory.json").write_text( + json.dumps(inv_out, indent=2), encoding="utf-8" + ) + (args.out_dir / "drift.json").write_text( + json.dumps(drift, indent=2), encoding="utf-8" + ) + 
print(f"Wrote {args.out_dir / 'live_inventory.json'}") + print(f"Wrote {args.out_dir / 'drift.json'}") + # Exit 2 only for duplicate guest IPs (hard failure). VMID vs ALL_VMIDS doc + # mismatches are informational — documentation often lags live `pct set`. + sys.exit(2 if duplicate_ips else 0) + + +if __name__ == "__main__": + main() diff --git a/scripts/it-ops/export-live-inventory-and-drift.sh b/scripts/it-ops/export-live-inventory-and-drift.sh new file mode 100755 index 0000000..9c681a0 --- /dev/null +++ b/scripts/it-ops/export-live-inventory-and-drift.sh @@ -0,0 +1,51 @@ +#!/usr/bin/env bash +# Live Proxmox guest inventory + drift vs config/ip-addresses.conf. +# Usage: bash scripts/it-ops/export-live-inventory-and-drift.sh +# Requires: SSH key root@SEED, python3 locally and on PVE. +set -euo pipefail +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" +# shellcheck source=/dev/null +source "${PROJECT_ROOT}/config/ip-addresses.conf" 2>/dev/null || true +SEED="${SEED_HOST:-${PROXMOX_HOST_R630_01:-192.168.11.11}}" +OUT_DIR="${OUT_DIR:-${PROJECT_ROOT}/reports/status}" +TS="$(date +%Y%m%d_%H%M%S)" +TMP="${TMPDIR:-/tmp}/live_inv_${TS}.json" +PY="${SCRIPT_DIR}/lib/collect_inventory_remote.py" + +mkdir -p "$OUT_DIR" + +stub_unreachable() { + python3 - <<'PY' +import json +from datetime import datetime, timezone +print(json.dumps({ + "collected_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + "error": "seed_unreachable", + "guests": [], +}, indent=2)) +PY +} + +if ! ping -c1 -W2 "$SEED" >/dev/null 2>&1; then + stub_unreachable >"$TMP" +else + if ! 
ssh -o BatchMode=yes -o ConnectTimeout=15 -o StrictHostKeyChecking=no \ + "root@${SEED}" "python3 -" <"$PY" >"$TMP" 2>/dev/null; then + stub_unreachable >"$TMP" + fi +fi + +set +e +python3 "${SCRIPT_DIR}/compute_ipam_drift.py" --live "$TMP" \ + --ip-conf "${PROJECT_ROOT}/config/ip-addresses.conf" \ + --all-vmids-md "${PROJECT_ROOT}/docs/04-configuration/ALL_VMIDS_ENDPOINTS.md" \ + --out-dir "$OUT_DIR" +DRIFT_RC=$? +set -e + +cp -f "$OUT_DIR/live_inventory.json" "${OUT_DIR}/live_inventory_${TS}.json" 2>/dev/null || true +cp -f "$OUT_DIR/drift.json" "${OUT_DIR}/drift_${TS}.json" 2>/dev/null || true +rm -f "$TMP" +echo "Latest: ${OUT_DIR}/live_inventory.json , ${OUT_DIR}/drift.json" +exit "${DRIFT_RC}" diff --git a/scripts/it-ops/lib/collect_inventory_remote.py b/scripts/it-ops/lib/collect_inventory_remote.py new file mode 100755 index 0000000..da5e184 --- /dev/null +++ b/scripts/it-ops/lib/collect_inventory_remote.py @@ -0,0 +1,109 @@ +#!/usr/bin/env python3 +"""Run ON a Proxmox cluster node (as root). 
Stdout: JSON live guest inventory.""" +from __future__ import annotations + +import json +import re +import subprocess +import sys +from datetime import datetime, timezone + + +def _run(cmd: list[str]) -> str: + return subprocess.check_output(cmd, text=True, stderr=subprocess.DEVNULL) + + +def _extract_ip_from_net_line(line: str) -> str | None: + m = re.search(r"ip=([0-9.]+)", line) + return m.group(1) if m else None + + +def _read_config(path: str) -> str: + try: + with open(path, encoding="utf-8", errors="replace") as f: + return f.read() + except OSError: + return "" + + +def main() -> None: + collected_at = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + try: + raw = _run( + ["pvesh", "get", "/cluster/resources", "--output-format", "json"] + ) + resources = json.loads(raw) + except (subprocess.CalledProcessError, json.JSONDecodeError) as e: + json.dump( + { + "collected_at": collected_at, + "error": f"pvesh_cluster_resources_failed: {e}", + "guests": [], + }, + sys.stdout, + indent=2, + ) + return + + guests: list[dict] = [] + for r in resources: + t = r.get("type") + if t not in ("lxc", "qemu"): + continue + vmid = r.get("vmid") + node = r.get("node") + if vmid is None or not node: + continue + vmid_s = str(vmid) + name = r.get("name") or "" + status = r.get("status") or "" + + if t == "lxc": + cfg_path = f"/etc/pve/nodes/{node}/lxc/{vmid_s}.conf" + else: + cfg_path = f"/etc/pve/nodes/{node}/qemu-server/{vmid_s}.conf" + + body = _read_config(cfg_path) + ip = "" + for line in body.splitlines(): + if line.startswith("net0:"): + got = _extract_ip_from_net_line(line) + if got: + ip = got + break + if not ip and t == "qemu": + for line in body.splitlines(): + if line.startswith("ipconfig0:"): + got = _extract_ip_from_net_line(line) + if got: + ip = got + break + if not ip and t == "qemu": + for line in body.splitlines(): + if line.startswith("net0:"): + got = _extract_ip_from_net_line(line) + if got: + ip = got + break + + guests.append( + { + "vmid": 
vmid_s, + "type": t, + "node": str(node), + "name": name, + "status": status, + "ip": ip, + "config_path": cfg_path, + } + ) + + out = { + "collected_at": collected_at, + "guests": sorted(guests, key=lambda g: int(g["vmid"])), + } + json.dump(out, sys.stdout, indent=2) + + +if __name__ == "__main__": + main() diff --git a/services/sankofa-it-read-api/README.md b/services/sankofa-it-read-api/README.md new file mode 100644 index 0000000..c3f072d --- /dev/null +++ b/services/sankofa-it-read-api/README.md @@ -0,0 +1,38 @@ +# Sankofa IT read API (Phase 0) + +Minimal **read-only** JSON service for `reports/status/live_inventory.json` and `drift.json`. Intended to run on a **LAN** host (or CT) with access to the repo checkout and optional SSH to Proxmox for refresh. + +## Run + +```bash +cd /path/to/proxmox +python3 services/sankofa-it-read-api/server.py +``` + +With API key protection for `/v1/*`: + +```bash +export IT_READ_API_KEY='your-long-random-secret' +python3 services/sankofa-it-read-api/server.py +``` + +Clients send `X-API-Key: your-long-random-secret` on `/v1/inventory/*`. `/health` stays unauthenticated. + +## Endpoints + +| Method | Path | Description | +|--------|------|-------------| +| GET | `/health` | Liveness + paths | +| GET | `/v1/inventory/live` | Latest live guest inventory | +| GET | `/v1/inventory/drift` | Latest drift report | +| POST | `/v1/inventory/refresh` | Runs `scripts/it-ops/export-live-inventory-and-drift.sh` (requires `IT_READ_API_KEY`) | + +Optional **`IT_READ_API_CORS_ORIGINS`**: comma-separated browser origins; enables `OPTIONS` and `Access-Control-Allow-*` for direct SPA calls (prefer Next.js `/api/it/*` proxy so keys stay server-side). + +## systemd + +See [`config/systemd/sankofa-it-read-api.service.example`](../../config/systemd/sankofa-it-read-api.service.example). 
+ +## Next (full BFF) + +Replace with OIDC-validated service, Postgres, and Proxmox/UniFi adapters per [SANKOFA_IT_OPERATIONS_CONTROLLER_SPEC.md](../../docs/02-architecture/SANKOFA_IT_OPERATIONS_CONTROLLER_SPEC.md). diff --git a/services/sankofa-it-read-api/server.py b/services/sankofa-it-read-api/server.py new file mode 100755 index 0000000..3f3faf0 --- /dev/null +++ b/services/sankofa-it-read-api/server.py @@ -0,0 +1,188 @@ +#!/usr/bin/env python3 +""" +Read-only HTTP API for IT inventory JSON (Phase 0 BFF stub). + +Serves latest reports/status/live_inventory.json and drift.json from the repo tree. +Optional IT_READ_API_KEY: when set, /v1/* requires header X-API-Key (GET and POST). + +Usage (from repo root): + IT_READ_API_KEY=secret python3 services/sankofa-it-read-api/server.py + # or + python3 services/sankofa-it-read-api/server.py # open /v1 without key + +Env: + IT_READ_API_HOST (default 127.0.0.1) + IT_READ_API_PORT (default 8787) + IT_READ_API_KEY (optional) + IT_READ_API_CORS_ORIGINS (optional, comma-separated; enables CORS for browser direct calls) +""" +from __future__ import annotations + +import json +import os +import subprocess +import sys +from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer +from pathlib import Path + + +def _project_root() -> Path: + here = Path(__file__).resolve() + for p in [here.parent.parent.parent, *here.parents]: + if (p / "config" / "ip-addresses.conf").is_file(): + return p + return Path.cwd() + + +ROOT = _project_root() +REPORTS = ROOT / "reports" / "status" +EXPORT_SCRIPT = ROOT / "scripts" / "it-ops" / "export-live-inventory-and-drift.sh" +API_KEY = os.environ.get("IT_READ_API_KEY", "").strip() +HOST = os.environ.get("IT_READ_API_HOST", "127.0.0.1") +PORT = int(os.environ.get("IT_READ_API_PORT", "8787")) +# Comma-separated origins for Access-Control-Allow-Origin (optional; portal should proxy via Next.js). 
+_CORS_RAW = os.environ.get("IT_READ_API_CORS_ORIGINS", "").strip() +CORS_ORIGINS = {o.strip() for o in _CORS_RAW.split(",") if o.strip()} + + +class Handler(BaseHTTPRequestHandler): + server_version = "SankofaITReadAPI/0.1" + + def log_message(self, fmt: str, *args) -> None: + sys.stderr.write("%s - %s\n" % (self.address_string(), fmt % args)) + + def _maybe_cors(self) -> None: + if not CORS_ORIGINS: + return + origin = (self.headers.get("Origin") or "").strip() + if origin in CORS_ORIGINS: + self.send_header("Access-Control-Allow-Origin", origin) + self.send_header("Access-Control-Allow-Methods", "GET, POST, OPTIONS") + self.send_header( + "Access-Control-Allow-Headers", + "Content-Type, X-API-Key", + ) + self.send_header("Vary", "Origin") + + def do_OPTIONS(self) -> None: + if CORS_ORIGINS: + self.send_response(204) + self._maybe_cors() + self.end_headers() + return + self._text(404, "Not found\n") + + def _json(self, code: int, obj: object) -> None: + body = json.dumps(obj, indent=2).encode("utf-8") + self.send_response(code) + self.send_header("Content-Type", "application/json") + self._maybe_cors() + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + + def _text(self, code: int, text: str, ctype: str = "text/plain") -> None: + body = text.encode("utf-8") + self.send_response(code) + self.send_header("Content-Type", f"{ctype}; charset=utf-8") + self._maybe_cors() + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + + def _auth_ok(self) -> bool: + if not API_KEY: + return True + return self.headers.get("X-API-Key") == API_KEY + + def do_GET(self) -> None: + path = self.path.split("?", 1)[0].rstrip("/") or "/" + if path == "/health": + self._json( + 200, + { + "ok": True, + "service": "sankofa-it-read-api", + "project_root": str(ROOT), + "auth_required_for_v1": bool(API_KEY), + }, + ) + return + + if path.startswith("/v1"): + if not self._auth_ok(): + self._json(401, 
{"error": "unauthorized"}) + return + if path == "/v1/inventory/live": + f = REPORTS / "live_inventory.json" + elif path == "/v1/inventory/drift": + f = REPORTS / "drift.json" + else: + self._json(404, {"error": "not_found"}) + return + if not f.is_file(): + self._json(404, {"error": "file_missing", "path": str(f)}) + return + try: + data = json.loads(f.read_text(encoding="utf-8")) + except json.JSONDecodeError as e: + self._json(500, {"error": "invalid_json", "detail": str(e)}) + return + self._json(200, data) + return + + self._text(404, "Not found. GET /health or /v1/inventory/live\n") + + def do_POST(self) -> None: + path = self.path.split("?", 1)[0].rstrip("/") or "/" + if path != "/v1/inventory/refresh": + self._json(404, {"error": "not_found"}) + return + if not API_KEY or not self._auth_ok(): + self._json(401, {"error": "unauthorized"}) + return + if not EXPORT_SCRIPT.is_file(): + self._json(500, {"error": "export_script_missing"}) + return + try: + subprocess.run( + ["bash", str(EXPORT_SCRIPT)], + cwd=str(ROOT), + check=True, + timeout=600, + capture_output=True, + text=True, + ) + except subprocess.CalledProcessError as e: + self._json( + 500, + { + "error": "export_failed", + "returncode": e.returncode, + "stderr": (e.stderr or "")[-4000:], + }, + ) + return + except subprocess.TimeoutExpired: + self._json(500, {"error": "export_timeout"}) + return + self._json(200, {"ok": True, "refreshed": True}) + + +def main() -> None: + httpd = ThreadingHTTPServer((HOST, PORT), Handler) + print(f"sankofa-it-read-api listening on http://{HOST}:{PORT}", file=sys.stderr) + print(f" project_root={ROOT}", file=sys.stderr) + print(f" GET /health GET /v1/inventory/live GET /v1/inventory/drift", file=sys.stderr) + if API_KEY: + print(" X-API-Key required for /v1/*", file=sys.stderr) + if CORS_ORIGINS: + print(f" CORS origins: {sorted(CORS_ORIGINS)}", file=sys.stderr) + try: + httpd.serve_forever() + except KeyboardInterrupt: + print("\nshutdown", file=sys.stderr) + + +if 
__name__ == "__main__": + main()