# docker-compose.yml (19 lines, 556 B) — recovered from web viewer export

# VM 5702 — Inference: llama.cpp server (CPU-friendly)
# Copy to /opt/ai/inference/ and place model at /opt/ai/inference/data/models/model.gguf
# See: docs/02-architecture/AI_AGENTS_57XX_DEPLOYMENT_PLAN.md Appendix D
services:
  llama:
    # Prebuilt llama.cpp HTTP server image (CPU build; --n-gpu-layers 0 below).
    image: ghcr.io/ggerganov/llama.cpp:server
    container_name: ai-inf-prod
    volumes:
      # Host model directory mounted read into the container at /models.
      - /opt/ai/inference/data/models:/models
    # Folded scalar (>-) joins these lines into one argument string and
    # strips the trailing newline. Flags: model path, bind address/port,
    # 0 GPU layers (pure CPU), 4096-token context window.
    command: >-
      -m /models/model.gguf
      --host 0.0.0.0 --port 8000
      --n-gpu-layers 0
      --ctx-size 4096
    ports:
      # Quoted to avoid YAML's sexagesimal/number parsing of HOST:CONTAINER.
      - "8000:8000"
    # Restart on failure or daemon restart, but stay down if stopped manually.
    restart: unless-stopped