Initial commit: add .gitignore and README

2026-02-09 21:51:46 -08:00
commit b970b4fc51
52 changed files with 3362 additions and 0 deletions
--- a/monitoring/alerts/prometheus-rules.yaml
+++ b/monitoring/alerts/prometheus-rules.yaml
@@ -0,0 +1,101 @@
+# Prometheus alerting rules for shared services: Kubernetes pods, HTTP applications, and nodes
+
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: shared-services-alerts
+  namespace: monitoring
+  labels:
+    app: prometheus
+spec:
+  groups:
+    - name: kubernetes.rules  # pod-level alerts: restarts, phase, memory and CPU against limits
+      interval: 30s  # how often the rules in this group are evaluated
+      rules:
+        - alert: PodCrashLooping  # containers of a pod restarted within the trailing 15m, sustained for 5m
+          expr: increase(kube_pod_container_status_restarts_total[15m]) > 0  # increase() = restarts per window; rate() would be per-second
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping"
+            description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} has restarted {{ printf \"%.0f\" $value }} times in the last 15 minutes"
+
+        - alert: PodNotReady  # pod reports a phase other than Running or Succeeded
+          expr: 'sum by (namespace, pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}) > 0'
+          for: 10m  # condition must hold for 10 minutes before the alert fires
+          labels:
+            severity: warning
+          annotations:
+            summary: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} is not ready'
+            description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for more than 10 minutes'
+
+        - alert: HighMemoryUsage  # pod memory usage above 90% of its summed container memory limits
+          expr: (sum by (namespace, pod) (container_memory_usage_bytes{container!="", container!="POD"}) / sum by (namespace, pod) (kube_pod_container_resource_limits{resource="memory"})) > 0.9
+          for: 5m  # container filters drop cAdvisor's pod-level aggregate and pause series; limits use the kube-state-metrics v2 metric
+          labels:
+            severity: warning
+          annotations:
+            summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} memory usage is high"
+            description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is using {{ $value | humanizePercentage }} of its memory limit"
+
+        - alert: HighCPUUsage  # pod CPU usage (cores, 5m rate) above 90% of its summed container CPU limits
+          expr: (sum by (namespace, pod) (rate(container_cpu_usage_seconds_total{container!="", container!="POD"}[5m])) / sum by (namespace, pod) (kube_pod_container_resource_limits{resource="cpu"})) > 0.9
+          for: 5m  # container filters drop cAdvisor's pod-level aggregate and pause series; limits use the kube-state-metrics v2 metric
+          labels:
+            severity: warning
+          annotations:
+            summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} CPU usage is high"
+            description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is using {{ $value | humanizePercentage }} of its CPU limit"
+
+    - name: application.rules  # HTTP error rate, request latency, and scrape-target availability
+      interval: 30s
+      rules:
+        - alert: HighErrorRate  # a 5xx series exceeding 0.1 requests/second (5m rate) for 5m
+          expr: 'rate(http_requests_total{status=~"5.."}[5m]) > 0.1'
+          for: 5m
+          labels:
+            severity: critical
+          annotations:
+            summary: 'High error rate detected'
+            description: 'Error rate is {{ $value }} errors per second'
+
+        - alert: HighLatency  # 95th percentile request duration above 1 second for 10m
+          expr: 'histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1'
+          for: 10m
+          labels:
+            severity: warning
+          annotations:
+            summary: 'High latency detected'
+            description: '95th percentile latency is {{ $value }} seconds'
+
+        - alert: ServiceDown  # any scrape target with a non-empty job label reporting up == 0 for 1m
+          expr: 'up{job=~".+"} == 0'
+          for: 1m
+          labels:
+            severity: critical
+          annotations:
+            summary: 'Service {{ $labels.job }} is down'
+            description: 'Service {{ $labels.job }} has been down for more than 1 minute'
+
+    - name: infrastructure.rules  # node readiness and filesystem headroom
+      interval: 30s
+      rules:
+        - alert: NodeNotReady  # the Ready=true condition series is 0 for 5m
+          expr: 'kube_node_status_condition{condition="Ready",status="true"} == 0'
+          for: 5m
+          labels:
+            severity: critical
+          annotations:
+            summary: 'Node {{ $labels.node }} is not ready'
+            description: 'Node {{ $labels.node }} has been in a not-ready state for more than 5 minutes'
+
+        - alert: DiskSpaceLow  # less than 10% of a filesystem's bytes available for 5m
+          expr: '(node_filesystem_avail_bytes / node_filesystem_size_bytes) < 0.1'
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            summary: 'Disk space low on {{ $labels.instance }}'
+            description: 'Disk {{ $labels.device }} on {{ $labels.instance }} has only {{ $value | humanizePercentage }} space available'
+
--- a/monitoring/event-monitoring/prometheus-rules.yaml
+++ b/monitoring/event-monitoring/prometheus-rules.yaml
@@ -0,0 +1,50 @@
+# Event bus (NATS) alerting rules: connections, message rate, JetStream storage, availability
+
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: event-bus-alerts
+  namespace: event-bus
+  labels:
+    app: nats
+spec:
+  groups:
+    - name: nats.rules
+      interval: 30s  # how often the rules in this group are evaluated
+      rules:
+        - alert: NATSConnectionFailure  # connections reported in any state other than OPEN for 5m
+          expr: 'nats_connz_connections{state!="OPEN"} > 0'
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            summary: 'NATS connection failure detected'
+            description: 'NATS has {{ $value }} non-open connections'
+
+        - alert: NATSHighMessageRate  # inbound message rate (5m) above 10000/s for 10m
+          expr: 'rate(nats_varz_in_msgs[5m]) > 10000'
+          for: 10m
+          labels:
+            severity: warning
+          annotations:
+            summary: 'High NATS message rate'
+            description: 'NATS is processing {{ $value }} messages per second'
+
+        - alert: NATSJetStreamStorageFull  # JetStream store bytes above 90% of the configured maximum
+          expr: '(nats_jetstream_varz_store_bytes / nats_jetstream_varz_store_max_bytes) > 0.9'
+          for: 5m
+          labels:
+            severity: critical
+          annotations:
+            summary: 'NATS JetStream storage nearly full'
+            description: 'JetStream storage is {{ $value | humanizePercentage }} full'
+
+        - alert: NATSPodDown  # scrape target of job "nats" reporting up == 0 for 1m
+          expr: 'up{job="nats"} == 0'
+          for: 1m
+          labels:
+            severity: critical
+          annotations:
+            summary: 'NATS pod is down'
+            description: 'NATS pod has been down for more than 1 minute'
+
--- a/monitoring/event-monitoring/setup.sh
+++ b/monitoring/event-monitoring/setup.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+# Set up event monitoring: apply the event bus PrometheusRule manifest with kubectl.
+
+set -e
+
+echo "📡 Setting up event monitoring..."
+
+# Apply Prometheus rules; the manifest is resolved next to this script so any working directory works
+echo "📊 Applying Prometheus rules for event bus..."
+kubectl apply -f "$(dirname "${BASH_SOURCE[0]}")/prometheus-rules.yaml"
+
+echo "✅ Event monitoring configured!"
+echo ""
+echo "📝 Monitoring:"
+echo "   - NATS connection metrics"
+echo "   - Message rate metrics"
+echo "   - JetStream storage metrics"
+echo "   - Pod health metrics"
+
--- a/monitoring/loki/install.sh
+++ b/monitoring/loki/install.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+# Install Loki Stack for Logging
+
+set -e
+
+NAMESPACE="monitoring"
+RELEASE_NAME="loki"
+
+echo "📊 Installing Loki Stack..."
+
+# Check prerequisites
+command -v helm >/dev/null 2>&1 || { echo "❌ helm not found"; exit 1; }
+command -v kubectl >/dev/null 2>&1 || { echo "❌ kubectl not found"; exit 1; }
+
+# Create namespace
+echo "📦 Creating namespace: $NAMESPACE"
+kubectl create namespace "$NAMESPACE" --dry-run=client -o yaml | kubectl apply -f -
+
+# Add Grafana Helm repo
+echo "📥 Adding Grafana Helm repository..."
+helm repo add grafana https://grafana.github.io/helm-charts
+helm repo update
+
+# Install Loki Stack
+echo "🚀 Installing Loki Stack..."
+helm upgrade --install "$RELEASE_NAME" grafana/loki-stack \
+  --namespace "$NAMESPACE" \
+  --create-namespace \
+  --values values.yaml \
+  --wait
+
+echo "✅ Loki Stack installed successfully!"
+echo ""
+echo "📝 Access Grafana:"
+echo "   kubectl port-forward -n $NAMESPACE svc/$RELEASE_NAME-grafana 3000:80"
+
--- a/monitoring/loki/values.yaml
+++ b/monitoring/loki/values.yaml
@@ -0,0 +1,87 @@
+# Loki Stack Helm values: Loki storage/limits, Promtail scraping, and a Grafana datasource
+
+loki:
+  enabled: true
+  persistence:
+    enabled: true
+    storageClassName: standard
+    size: 50Gi
+  resources:
+    requests:
+      memory: 1Gi
+      cpu: 500m
+    limits:
+      memory: 2Gi
+      cpu: 1000m
+  config:
+    schema_config:
+      configs:
+        - from: "2020-10-24"  # quoted so YAML 1.1 parsers keep it a string instead of typing it as a date
+          store: boltdb-shipper
+          object_store: filesystem
+          schema: v11
+          index:
+            prefix: index_
+            period: 24h
+    storage_config:
+      boltdb_shipper:
+        active_index_directory: /loki/boltdb-shipper-active
+        cache_location: /loki/boltdb-shipper-cache
+        shared_store: filesystem
+      filesystem:
+        directory: /loki/chunks
+    limits_config:
+      ingestion_rate_mb: 10
+      ingestion_burst_size_mb: 20
+      max_query_length: 0h  # NOTE(review): presumably 0 disables the query-length limit — confirm against Loki docs
+      max_query_parallelism: 32
+      max_streams_per_user: 10000
+
+promtail:
+  enabled: true
+  config:
+    clients:
+      - url: 'http://loki:3100/loki/api/v1/push'
+    scrape_configs:
+      - job_name: kubernetes-pods
+        kubernetes_sd_configs:
+          - role: pod
+        relabel_configs:
+          - source_labels:  # controller name with an optional trailing -<8..10 hex chars> group split off
+              - __meta_kubernetes_pod_controller_name
+            regex: '([0-9a-z-.]+?)(-[0-9a-f]{8,10})?'
+            action: replace
+            target_label: __tmp_controller_name
+          - source_labels:  # app = first non-empty value among these labels, in the order listed
+              - __meta_kubernetes_pod_label_app_kubernetes_io_name
+              - __meta_kubernetes_pod_label_app
+              - __tmp_controller_name
+              - __meta_kubernetes_pod_name
+            regex: '^;*([^;]+)(;.*)?$'
+            action: replace
+            target_label: app
+          - source_labels:  # instance = first non-empty value among these labels
+              - __meta_kubernetes_pod_label_app_kubernetes_io_instance
+              - __meta_kubernetes_pod_label_release
+            regex: '^;*([^;]+)(;.*)?$'
+            action: replace
+            target_label: instance
+          - source_labels:  # component = first non-empty value among these labels
+              - __meta_kubernetes_pod_label_app_kubernetes_io_component
+              - __meta_kubernetes_pod_label_component
+            regex: '^;*([^;]+)(;.*)?$'
+            action: replace
+            target_label: component
+
+grafana:
+  enabled: true
+  datasources:
+    datasources.yaml:  # datasource provisioning document
+      apiVersion: 1
+      datasources:
+        - name: Loki
+          type: loki
+          access: proxy
+          url: 'http://loki:3100'  # same host and port Promtail pushes to
+          isDefault: false
+
--- a/monitoring/metrics-dashboard/grafana-dashboard.json
+++ b/monitoring/metrics-dashboard/grafana-dashboard.json
@@ -0,0 +1,151 @@
+{
+  "dashboard": {
+    "title": "Integration & Streamlining Success Metrics",
+    "tags": ["metrics", "success", "integration"],
+    "timezone": "browser",
+    "panels": [
+      {
+        "id": 1,
+        "title": "Infrastructure Cost Reduction",
+        "type": "stat",
+        "targets": [
+          {
+            "expr": "infrastructure_cost_reduction_percent",
+            "legendFormat": "Cost Reduction"
+          }
+        ],
+        "fieldConfig": {
+          "defaults": {
+            "unit": "percent",
+            "thresholds": {
+              "steps": [
+                { "value": 0, "color": "red" },
+                { "value": 30, "color": "yellow" },
+                { "value": 40, "color": "green" }
+              ]
+            }
+          }
+        }
+      },
+      {
+        "id": 2,
+        "title": "Shared Infrastructure Adoption",
+        "type": "stat",
+        "targets": [
+          {
+            "expr": "shared_infrastructure_adoption_percent",
+            "legendFormat": "Adoption"
+          }
+        ],
+        "fieldConfig": {
+          "defaults": {
+            "unit": "percent",
+            "thresholds": {
+              "steps": [
+                { "value": 0, "color": "red" },
+                { "value": 60, "color": "yellow" },
+                { "value": 80, "color": "green" }
+              ]
+            }
+          }
+        }
+      },
+      {
+        "id": 3,
+        "title": "Shared Packages Usage",
+        "type": "stat",
+        "targets": [
+          {
+            "expr": "shared_packages_count",
+            "legendFormat": "Packages"
+          }
+        ],
+        "fieldConfig": {
+          "defaults": {
+            "unit": "short",
+            "thresholds": {
+              "steps": [
+                { "value": 0, "color": "red" },
+                { "value": 7, "color": "yellow" },
+                { "value": 10, "color": "green" }
+              ]
+            }
+          }
+        }
+      },
+      {
+        "id": 4,
+        "title": "Deployment Time Reduction",
+        "type": "stat",
+        "targets": [
+          {
+            "expr": "deployment_time_reduction_percent",
+            "legendFormat": "Time Reduction"
+          }
+        ],
+        "fieldConfig": {
+          "defaults": {
+            "unit": "percent",
+            "thresholds": {
+              "steps": [
+                { "value": 0, "color": "red" },
+                { "value": 30, "color": "yellow" },
+                { "value": 50, "color": "green" }
+              ]
+            }
+          }
+        }
+      },
+      {
+        "id": 5,
+        "title": "CI/CD Adoption",
+        "type": "stat",
+        "targets": [
+          {
+            "expr": "cicd_adoption_percent",
+            "legendFormat": "Adoption"
+          }
+        ],
+        "fieldConfig": {
+          "defaults": {
+            "unit": "percent",
+            "thresholds": {
+              "steps": [
+                { "value": 0, "color": "red" },
+                { "value": 70, "color": "yellow" },
+                { "value": 90, "color": "green" }
+              ]
+            }
+          }
+        }
+      },
+      {
+        "id": 6,
+        "title": "Service Uptime",
+        "type": "stat",
+        "targets": [
+          {
+            "expr": "service_uptime_percent",
+            "legendFormat": "Uptime"
+          }
+        ],
+        "fieldConfig": {
+          "defaults": {
+            "unit": "percent",
+            "thresholds": {
+              "steps": [
+                { "value": 0, "color": "red" },
+                { "value": 99, "color": "yellow" },
+                { "value": 99.9, "color": "green" }
+              ]
+            }
+          }
+        }
+      }
+    ],
+    "refresh": "30s",
+    "schemaVersion": 27,
+    "version": 1
+  }
+}
+
--- a/monitoring/metrics-dashboard/setup.sh
+++ b/monitoring/metrics-dashboard/setup.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+# Publish the success-metrics Grafana dashboard JSON as a ConfigMap in the monitoring namespace.
+
+set -e
+
+NAMESPACE="monitoring"
+DASHBOARD_FILE="$(dirname "${BASH_SOURCE[0]}")/grafana-dashboard.json"  # next to this script, independent of cwd
+
+echo "📊 Setting up Metrics Dashboard in Grafana..."
+
+# Check if Grafana is accessible: look for any service whose listing line mentions "grafana"
+if ! kubectl get svc -n "$NAMESPACE" | grep -q grafana; then
+  echo "⚠️  Grafana not found in namespace $NAMESPACE"
+  echo "   → Deploy Grafana first: cd ../prometheus && ./install.sh"
+  exit 1
+fi
+
+# Create ConfigMap with dashboard (client-side dry run piped into apply, so re-runs update it in place)
+echo "📝 Creating dashboard ConfigMap..."
+kubectl create configmap metrics-dashboard \
+  --from-file=dashboard.json="$DASHBOARD_FILE" \
+  -n "$NAMESPACE" \
+  --dry-run=client -o yaml | kubectl apply -f -
+
+echo "✅ Metrics dashboard configured!"
+echo ""
+echo "📝 Next steps:"
+echo "   1. Access Grafana: kubectl port-forward -n $NAMESPACE svc/prometheus-grafana 3000:80"
+echo "   2. Import dashboard from ConfigMap"
+echo "   3. Configure data sources"
+echo "   4. Set up metrics collection"
+
--- a/monitoring/prometheus/install.sh
+++ b/monitoring/prometheus/install.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+# Install Prometheus/Grafana Stack
+
+set -e
+
+NAMESPACE="monitoring"
+RELEASE_NAME="prometheus"
+
+echo "📊 Installing Prometheus/Grafana Stack..."
+
+# Check if helm is installed
+if ! command -v helm &> /dev/null; then
+  echo "❌ Helm not found. Please install Helm first."
+  exit 1
+fi
+
+# Check if kubectl is installed
+if ! command -v kubectl &> /dev/null; then
+  echo "❌ kubectl not found. Please install kubectl first."
+  exit 1
+fi
+
+# Create namespace
+echo "📦 Creating namespace: $NAMESPACE"
+kubectl create namespace "$NAMESPACE" --dry-run=client -o yaml | kubectl apply -f -
+
+# Add Prometheus Helm repo
+echo "📥 Adding Prometheus Helm repository..."
+helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
+helm repo update
+
+# Install Prometheus Stack
+echo "🚀 Installing Prometheus Stack..."
+helm upgrade --install "$RELEASE_NAME" prometheus-community/kube-prometheus-stack \
+  --namespace "$NAMESPACE" \
+  --create-namespace \
+  --values values.yaml \
+  --wait
+
+echo "✅ Prometheus/Grafana Stack installed successfully!"
+echo ""
+echo "📝 Access Grafana:"
+echo "   kubectl port-forward -n $NAMESPACE svc/$RELEASE_NAME-grafana 3000:80"
+echo ""
+echo "📝 Access Prometheus:"
+echo "   kubectl port-forward -n $NAMESPACE svc/$RELEASE_NAME-kube-prom-prometheus 9090:9090"
+
--- a/monitoring/prometheus/values.yaml
+++ b/monitoring/prometheus/values.yaml
@@ -0,0 +1,86 @@
+# Helm values for the kube-prometheus-stack chart:
+# Prometheus storage and resources, Grafana, Alertmanager routing, and exporters
+
+prometheus:
+  prometheusSpec:
+    retention: 30d
+    retentionSize: 50GB  # half of the 100Gi volume requested below
+    storageSpec:
+      volumeClaimTemplate:
+        spec:
+          accessModes: ['ReadWriteOnce']
+          storageClassName: standard
+          resources:
+            requests:
+              storage: 100Gi
+    resources:
+      requests:
+        memory: 2Gi
+        cpu: 1000m
+      limits:
+        memory: 4Gi
+        cpu: 2000m
+    serviceMonitorSelectorNilUsesHelmValues: false  # false: do not limit selection to this Helm release's own objects
+    podMonitorSelectorNilUsesHelmValues: false
+    ruleSelectorNilUsesHelmValues: false
+
+grafana:
+  enabled: true
+  adminPassword: 'admin' # Change in production
+  persistence:
+    enabled: true
+    size: 10Gi
+  resources:
+    requests:
+      memory: 256Mi
+      cpu: 100m
+    limits:
+      memory: 512Mi
+      cpu: 500m
+  dashboardProviders:
+    dashboardproviders.yaml:  # file-based dashboard provider definition
+      apiVersion: 1
+      providers:
+      - name: 'default'
+        orgId: 1
+        folder: ''
+        type: file
+        disableDelete: false
+        editable: true
+        options:
+          path: /var/lib/grafana/dashboards/default
+  dashboards:
+    default:  # same name as the 'default' provider above
+      kubernetes-cluster:
+        gnetId: 7249
+        revision: 1
+        datasource: Prometheus
+
+alertmanager:
+  enabled: true
+  config:
+    global:
+      resolve_timeout: 5m
+    route:
+      group_by: ['alertname']
+      group_wait: 10s
+      group_interval: 10s
+      repeat_interval: 12h
+      receiver: 'default'
+    receivers:
+    - name: 'null'  # chart default sends Watchdog to 'null'; Helm merges `route` but replaces this list, so keep it defined
+    - name: 'default'
+      email_configs:
+      - to: 'alerts@example.com'
+        from: 'prometheus@example.com'
+        smarthost: 'smtp.example.com:587'
+        auth_username: 'prometheus'
+        auth_password: 'password'  # placeholder: supply real SMTP credentials from a secret, not from this file
+        headers: {Subject: 'Prometheus Alert'}
+
+kubeStateMetrics:
+  enabled: true  # deploy kube-state-metrics together with the stack
+
+nodeExporter:
+  enabled: true  # deploy node-exporter together with the stack
+