Initial commit: add .gitignore and README

This commit is contained in:
defiQUG
2026-02-09 21:51:46 -08:00
commit b970b4fc51
52 changed files with 3362 additions and 0 deletions

View File

@@ -0,0 +1,101 @@
---
# Prometheus Alerting Rules
#
# PrometheusRule custom resource (Prometheus Operator / kube-prometheus-stack).
# Defines shared-service alerts for workload health, resource saturation,
# application SLOs, and node/disk state.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: shared-services-alerts
  namespace: monitoring
  labels:
    app: prometheus
spec:
  groups:
    # Workload-level health signals (kube-state-metrics / cAdvisor metrics).
    - name: kubernetes.rules
      interval: 30s
      rules:
        - alert: PodCrashLooping
          # Any container restart activity over the last 15m is treated as looping.
          expr: rate(kube_pod_container_status_restarts_total[15m]) > 0
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping"
            description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} has restarted {{ $value }} times in the last 15 minutes"
        - alert: PodNotReady
          # Pods stuck in Pending/Failed/Unknown (anything but Running/Succeeded).
          expr: sum by (namespace, pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}) > 0
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is not ready"
            description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for more than 10 minutes"
        - alert: HighMemoryUsage
          # NOTE(review): container_memory_usage_bytes includes page cache;
          # container_memory_working_set_bytes may be the intended signal — confirm.
          expr: (sum by (namespace, pod) (container_memory_usage_bytes) / sum by (namespace, pod) (kube_pod_container_resource_limits_memory_bytes)) > 0.9
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} memory usage is high"
            description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is using {{ $value | humanizePercentage }} of its memory limit"
        - alert: HighCPUUsage
          expr: (sum by (namespace, pod) (rate(container_cpu_usage_seconds_total[5m])) / sum by (namespace, pod) (kube_pod_container_resource_limits_cpu)) > 0.9
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} CPU usage is high"
            description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is using {{ $value | humanizePercentage }} of its CPU limit"
    # Application-level SLO signals from HTTP instrumentation.
    - name: application.rules
      interval: 30s
      rules:
        - alert: HighErrorRate
          # NOTE(review): this is absolute 5xx req/s, not an error ratio — confirm
          # 0.1 req/s is the intended threshold across all traffic volumes.
          expr: rate(http_requests_total{status=~"5.."}[5m]) > 0.1
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "High error rate detected"
            description: "Error rate is {{ $value }} errors per second"
        - alert: HighLatency
          # p95 latency over 1s for 10 minutes.
          expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "High latency detected"
            description: "95th percentile latency is {{ $value }} seconds"
        - alert: ServiceDown
          # NOTE(review): the job=~".+" matcher matches every job (no-op filter);
          # plain `up == 0` is equivalent — kept as-is to preserve behavior.
          expr: up{job=~".+"} == 0
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "Service {{ $labels.job }} is down"
            description: "Service {{ $labels.job }} has been down for more than 1 minute"
    # Node/infrastructure signals (kube-state-metrics / node-exporter).
    - name: infrastructure.rules
      interval: 30s
      rules:
        - alert: NodeNotReady
          expr: kube_node_status_condition{condition="Ready",status="true"} == 0
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Node {{ $labels.node }} is not ready"
            description: "Node {{ $labels.node }} has been in a not-ready state for more than 5 minutes"
        - alert: DiskSpaceLow
          # NOTE(review): matches every filesystem, including tmpfs/read-only
          # mounts — consider an fstype filter to reduce noise.
          expr: (node_filesystem_avail_bytes / node_filesystem_size_bytes) < 0.1
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Disk space low on {{ $labels.instance }}"
            description: "Disk {{ $labels.device }} on {{ $labels.instance }} has only {{ $value | humanizePercentage }} space available"

View File

@@ -0,0 +1,50 @@
---
# Event Bus Monitoring Rules
#
# PrometheusRule custom resource defining NATS / JetStream alerts
# for the event bus namespace.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: event-bus-alerts
  namespace: event-bus
  labels:
    app: nats
spec:
  groups:
    - name: nats.rules
      interval: 30s
      rules:
        - alert: NATSConnectionFailure
          # Any connection not in the OPEN state for 5 minutes.
          expr: nats_connz_connections{state!="OPEN"} > 0
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "NATS connection failure detected"
            description: "NATS has {{ $value }} non-open connections"
        - alert: NATSHighMessageRate
          # Sustained inbound throughput above 10k msgs/s for 10 minutes.
          expr: rate(nats_varz_in_msgs[5m]) > 10000
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "High NATS message rate"
            description: "NATS is processing {{ $value }} messages per second"
        - alert: NATSJetStreamStorageFull
          # JetStream file store above 90% of its configured maximum.
          expr: (nats_jetstream_varz_store_bytes / nats_jetstream_varz_store_max_bytes) > 0.9
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "NATS JetStream storage nearly full"
            description: "JetStream storage is {{ $value | humanizePercentage }} full"
        - alert: NATSPodDown
          expr: up{job="nats"} == 0
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "NATS pod is down"
            description: "NATS pod has been down for more than 1 minute"

View File

@@ -0,0 +1,19 @@
#!/bin/bash
# Setup event monitoring: apply the event-bus PrometheusRule manifest.
#
# -e: abort on error; -u: unset vars are errors; pipefail: fail whole pipeline.
set -euo pipefail

# Resolve the script's own directory so the manifest is found regardless of
# the caller's working directory (the original relied on CWD).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

echo "📡 Setting up event monitoring..."

# Apply Prometheus rules
echo "📊 Applying Prometheus rules for event bus..."
kubectl apply -f "$SCRIPT_DIR/prometheus-rules.yaml"

echo "✅ Event monitoring configured!"
echo ""
echo "📝 Monitoring:"
echo " - NATS connection metrics"
echo " - Message rate metrics"
echo " - JetStream storage metrics"
echo " - Pod health metrics"

36
monitoring/loki/install.sh Executable file
View File

@@ -0,0 +1,36 @@
#!/bin/bash
# Install Loki Stack for Logging (grafana/loki-stack via Helm).
#
# -e: abort on error; -u: unset vars are errors; pipefail: fail whole pipeline.
set -euo pipefail

NAMESPACE="monitoring"
RELEASE_NAME="loki"

# Resolve the script's own directory so values.yaml is found regardless of
# the caller's working directory (the original relied on CWD).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

echo "📊 Installing Loki Stack..."

# Check prerequisites
command -v helm >/dev/null 2>&1 || { echo "❌ helm not found"; exit 1; }
command -v kubectl >/dev/null 2>&1 || { echo "❌ kubectl not found"; exit 1; }

# Create namespace (idempotent: dry-run render piped to apply)
echo "📦 Creating namespace: $NAMESPACE"
kubectl create namespace "$NAMESPACE" --dry-run=client -o yaml | kubectl apply -f -

# Add Grafana Helm repo
echo "📥 Adding Grafana Helm repository..."
helm repo add grafana https://grafana.github.io/helm-charts
helm repo update

# Install Loki Stack (upgrade --install is idempotent)
echo "🚀 Installing Loki Stack..."
helm upgrade --install "$RELEASE_NAME" grafana/loki-stack \
  --namespace "$NAMESPACE" \
  --create-namespace \
  --values "$SCRIPT_DIR/values.yaml" \
  --wait

echo "✅ Loki Stack installed successfully!"
echo ""
echo "📝 Access Grafana:"
echo " kubectl port-forward -n $NAMESPACE svc/$RELEASE_NAME-grafana 3000:80"

View File

@@ -0,0 +1,87 @@
---
# Loki Stack Helm Values (grafana/loki-stack chart)
loki:
  enabled: true
  persistence:
    enabled: true
    storageClassName: standard
    size: 50Gi
  resources:
    requests:
      memory: 1Gi
      cpu: 500m
    limits:
      memory: 2Gi
      cpu: 1000m
  config:
    schema_config:
      configs:
        # Quoted so the date stays a string: YAML 1.1 parsers turn an
        # unquoted ISO date into a date object, which Loki rejects.
        - from: "2020-10-24"
          store: boltdb-shipper
          object_store: filesystem
          schema: v11
          index:
            prefix: index_
            period: 24h
    storage_config:
      boltdb_shipper:
        active_index_directory: /loki/boltdb-shipper-active
        cache_location: /loki/boltdb-shipper-cache
        shared_store: filesystem
      filesystem:
        directory: /loki/chunks
    limits_config:
      ingestion_rate_mb: 10
      ingestion_burst_size_mb: 20
      max_query_length: 0h  # 0 disables the query-length limit
      max_query_parallelism: 32
      max_streams_per_user: 10000
promtail:
  enabled: true
  config:
    clients:
      - url: http://loki:3100/loki/api/v1/push
    scrape_configs:
      - job_name: kubernetes-pods
        kubernetes_sd_configs:
          - role: pod
        relabel_configs:
          # Strip the ReplicaSet hash so all pods of a Deployment share
          # one controller-derived name.
          - source_labels:
              - __meta_kubernetes_pod_controller_name
            regex: ([0-9a-z-.]+?)(-[0-9a-f]{8,10})?
            action: replace
            target_label: __tmp_controller_name
          # First non-empty of: app.kubernetes.io/name, app label,
          # controller name, pod name.
          - source_labels:
              - __meta_kubernetes_pod_label_app_kubernetes_io_name
              - __meta_kubernetes_pod_label_app
              - __tmp_controller_name
              - __meta_kubernetes_pod_name
            regex: ^;*([^;]+)(;.*)?$
            action: replace
            target_label: app
          # First non-empty of: app.kubernetes.io/instance, release label.
          - source_labels:
              - __meta_kubernetes_pod_label_app_kubernetes_io_instance
              - __meta_kubernetes_pod_label_release
            regex: ^;*([^;]+)(;.*)?$
            action: replace
            target_label: instance
          # First non-empty of: app.kubernetes.io/component, component label.
          - source_labels:
              - __meta_kubernetes_pod_label_app_kubernetes_io_component
              - __meta_kubernetes_pod_label_component
            regex: ^;*([^;]+)(;.*)?$
            action: replace
            target_label: component
grafana:
  enabled: true
  datasources:
    datasources.yaml:
      apiVersion: 1
      datasources:
        - name: Loki
          type: loki
          access: proxy
          url: http://loki:3100
          isDefault: false

View File

@@ -0,0 +1,151 @@
{
  "dashboard": {
    "title": "Integration & Streamlining Success Metrics",
    "tags": ["metrics", "success", "integration"],
    "timezone": "browser",
    "panels": [
      {
        "id": 1,
        "title": "Infrastructure Cost Reduction",
        "type": "stat",
        "targets": [
          {
            "expr": "infrastructure_cost_reduction_percent",
            "legendFormat": "Cost Reduction"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "percent",
            "thresholds": {
              "steps": [
                { "value": 0, "color": "red" },
                { "value": 30, "color": "yellow" },
                { "value": 40, "color": "green" }
              ]
            }
          }
        }
      },
      {
        "id": 2,
        "title": "Shared Infrastructure Adoption",
        "type": "stat",
        "targets": [
          {
            "expr": "shared_infrastructure_adoption_percent",
            "legendFormat": "Adoption"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "percent",
            "thresholds": {
              "steps": [
                { "value": 0, "color": "red" },
                { "value": 60, "color": "yellow" },
                { "value": 80, "color": "green" }
              ]
            }
          }
        }
      },
      {
        "id": 3,
        "title": "Shared Packages Usage",
        "type": "stat",
        "targets": [
          {
            "expr": "shared_packages_count",
            "legendFormat": "Packages"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "short",
            "thresholds": {
              "steps": [
                { "value": 0, "color": "red" },
                { "value": 7, "color": "yellow" },
                { "value": 10, "color": "green" }
              ]
            }
          }
        }
      },
      {
        "id": 4,
        "title": "Deployment Time Reduction",
        "type": "stat",
        "targets": [
          {
            "expr": "deployment_time_reduction_percent",
            "legendFormat": "Time Reduction"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "percent",
            "thresholds": {
              "steps": [
                { "value": 0, "color": "red" },
                { "value": 30, "color": "yellow" },
                { "value": 50, "color": "green" }
              ]
            }
          }
        }
      },
      {
        "id": 5,
        "title": "CI/CD Adoption",
        "type": "stat",
        "targets": [
          {
            "expr": "cicd_adoption_percent",
            "legendFormat": "Adoption"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "percent",
            "thresholds": {
              "steps": [
                { "value": 0, "color": "red" },
                { "value": 70, "color": "yellow" },
                { "value": 90, "color": "green" }
              ]
            }
          }
        }
      },
      {
        "id": 6,
        "title": "Service Uptime",
        "type": "stat",
        "targets": [
          {
            "expr": "service_uptime_percent",
            "legendFormat": "Uptime"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "percent",
            "thresholds": {
              "steps": [
                { "value": 0, "color": "red" },
                { "value": 99, "color": "yellow" },
                { "value": 99.9, "color": "green" }
              ]
            }
          }
        }
      }
    ],
    "refresh": "30s",
    "schemaVersion": 27,
    "version": 1
  }
}

View File

@@ -0,0 +1,32 @@
#!/bin/bash
# Setup metrics dashboard in Grafana: package the dashboard JSON as a ConfigMap.
#
# -e: abort on error; -u: unset vars are errors; pipefail: fail whole pipeline.
set -euo pipefail

NAMESPACE="monitoring"

# Resolve the dashboard file relative to this script so the setup works
# regardless of the caller's working directory (the original relied on CWD).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
DASHBOARD_FILE="$SCRIPT_DIR/grafana-dashboard.json"

echo "📊 Setting up Metrics Dashboard in Grafana..."

# Check if Grafana is accessible
if ! kubectl get svc -n "$NAMESPACE" | grep -q grafana; then
  echo "⚠️ Grafana not found in namespace $NAMESPACE"
  echo " → Deploy Grafana first: cd ../prometheus && ./install.sh"
  exit 1
fi

# Create ConfigMap with dashboard (idempotent via dry-run | apply)
echo "📝 Creating dashboard ConfigMap..."
kubectl create configmap metrics-dashboard \
  --from-file=dashboard.json="$DASHBOARD_FILE" \
  -n "$NAMESPACE" \
  --dry-run=client -o yaml | kubectl apply -f -

echo "✅ Metrics dashboard configured!"
echo ""
echo "📝 Next steps:"
echo " 1. Access Grafana: kubectl port-forward -n $NAMESPACE svc/prometheus-grafana 3000:80"
echo " 2. Import dashboard from ConfigMap"
echo " 3. Configure data sources"
echo " 4. Set up metrics collection"
View File

@@ -0,0 +1,47 @@
#!/bin/bash
# Install the Prometheus/Grafana monitoring stack
# (prometheus-community/kube-prometheus-stack via Helm).
#
# -e: abort on error; -u: unset vars are errors; pipefail: fail whole pipeline.
set -euo pipefail

NAMESPACE="monitoring"
RELEASE_NAME="prometheus"

# Resolve the script's own directory so values.yaml is found regardless of
# the caller's working directory (the original relied on CWD).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

echo "📊 Installing Prometheus/Grafana Stack..."

# Check if helm is installed
if ! command -v helm &> /dev/null; then
  echo "❌ Helm not found. Please install Helm first."
  exit 1
fi

# Check if kubectl is installed
if ! command -v kubectl &> /dev/null; then
  echo "❌ kubectl not found. Please install kubectl first."
  exit 1
fi

# Create namespace (idempotent: dry-run render piped to apply)
echo "📦 Creating namespace: $NAMESPACE"
kubectl create namespace "$NAMESPACE" --dry-run=client -o yaml | kubectl apply -f -

# Add Prometheus Helm repo
echo "📥 Adding Prometheus Helm repository..."
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo update

# Install Prometheus Stack (upgrade --install is idempotent)
echo "🚀 Installing Prometheus Stack..."
helm upgrade --install "$RELEASE_NAME" prometheus-community/kube-prometheus-stack \
  --namespace "$NAMESPACE" \
  --create-namespace \
  --values "$SCRIPT_DIR/values.yaml" \
  --wait

echo "✅ Prometheus/Grafana Stack installed successfully!"
echo ""
echo "📝 Access Grafana:"
echo " kubectl port-forward -n $NAMESPACE svc/$RELEASE_NAME-grafana 3000:80"
echo ""
echo "📝 Access Prometheus:"
echo " kubectl port-forward -n $NAMESPACE svc/$RELEASE_NAME-kube-prom-prometheus 9090:9090"

View File

@@ -0,0 +1,86 @@
---
# Prometheus Stack Helm Values
# For use with kube-prometheus-stack
prometheus:
  prometheusSpec:
    retention: 30d
    retentionSize: 50GB
    storageSpec:
      volumeClaimTemplate:
        spec:
          accessModes: ["ReadWriteOnce"]
          storageClassName: standard
          resources:
            requests:
              storage: 100Gi
    resources:
      requests:
        memory: 2Gi
        cpu: 1000m
      limits:
        memory: 4Gi
        cpu: 2000m
    # Discover ServiceMonitors/PodMonitors/Rules from all namespaces,
    # not only those released by this chart.
    serviceMonitorSelectorNilUsesHelmValues: false
    podMonitorSelectorNilUsesHelmValues: false
    ruleSelectorNilUsesHelmValues: false
grafana:
  enabled: true
  # NOTE(review): plaintext default credential committed to VCS — change in
  # production and prefer sourcing from a Secret/external store.
  adminPassword: "admin"
  persistence:
    enabled: true
    size: 10Gi
  resources:
    requests:
      memory: 256Mi
      cpu: 100m
    limits:
      memory: 512Mi
      cpu: 500m
  dashboardProviders:
    dashboardproviders.yaml:
      apiVersion: 1
      providers:
        - name: 'default'
          orgId: 1
          folder: ''
          type: file
          disableDelete: false
          editable: true
          options:
            path: /var/lib/grafana/dashboards/default
  dashboards:
    default:
      # Community "Kubernetes Cluster" dashboard from grafana.com.
      kubernetes-cluster:
        gnetId: 7249
        revision: 1
        datasource: Prometheus
alertmanager:
  enabled: true
  config:
    global:
      resolve_timeout: 5m
    route:
      group_by: ['alertname']
      group_wait: 10s
      group_interval: 10s
      repeat_interval: 12h
      receiver: 'default'
    receivers:
      - name: 'default'
        email_configs:
          - to: 'alerts@example.com'
            from: 'prometheus@example.com'
            smarthost: 'smtp.example.com:587'
            auth_username: 'prometheus'
            # NOTE(review): placeholder SMTP secret committed to VCS — source
            # from a Secret/external store before production use.
            auth_password: 'password'
            headers:
              Subject: 'Prometheus Alert'
kubeStateMetrics:
  enabled: true
nodeExporter:
  enabled: true