Initial commit: add .gitignore and README
This commit is contained in:
101
monitoring/alerts/prometheus-rules.yaml
Normal file
101
monitoring/alerts/prometheus-rules.yaml
Normal file
@@ -0,0 +1,101 @@
|
||||
# Prometheus Alerting Rules
#
# PrometheusRule resource with three rule groups: kubernetes.rules
# (pod-level alerts), application.rules (HTTP and scrape-target alerts) and
# infrastructure.rules (node and filesystem alerts).
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: shared-services-alerts
  namespace: monitoring
  labels:
    app: prometheus
spec:
  groups:
    - name: kubernetes.rules
      interval: 30s
      rules:
        - alert: PodCrashLooping
          # increase() yields the number of restarts over the window, which is
          # what the description reports; rate() would expose a per-second
          # value instead. The firing condition (> 0) is unchanged.
          expr: increase(kube_pod_container_status_restarts_total[15m]) > 0
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping"
            # increase() extrapolates, so round the value before printing it.
            description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} has restarted {{ printf "%.0f" $value }} times in the last 15 minutes'

        - alert: PodNotReady
          # Matches pods whose phase is anything other than Running or
          # Succeeded. Readiness-probe failures of a Running pod are not
          # covered by this expression.
          expr: sum by (namespace, pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}) > 0
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is not ready"
            description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for more than 10 minutes"

        - alert: HighMemoryUsage
          # Ratio of summed container memory usage to summed memory limits,
          # per pod.
          #  - container!="" / container!="POD" drops the pod-level cgroup
          #    series and the pause container so usage is not double counted.
          #  - kube-state-metrics v2 exposes limits as
          #    kube_pod_container_resource_limits{resource="memory"}; the
          #    former *_memory_bytes metric no longer exists there.
          expr: |-
            (
              sum by (namespace, pod) (container_memory_usage_bytes{container!="", container!="POD"})
              /
              sum by (namespace, pod) (kube_pod_container_resource_limits{resource="memory"})
            ) > 0.9
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} memory usage is high"
            description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is using {{ $value | humanizePercentage }} of its memory limit"

        - alert: HighCPUUsage
          # Ratio of CPU seconds consumed per second to summed CPU limits
          # (cores), per pod. Same series filtering and kube-state-metrics v2
          # metric naming as HighMemoryUsage.
          expr: |-
            (
              sum by (namespace, pod) (rate(container_cpu_usage_seconds_total{container!="", container!="POD"}[5m]))
              /
              sum by (namespace, pod) (kube_pod_container_resource_limits{resource="cpu"})
            ) > 0.9
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} CPU usage is high"
            description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is using {{ $value | humanizePercentage }} of its CPU limit"

    - name: application.rules
      interval: 30s
      rules:
        - alert: HighErrorRate
          # Absolute rate of 5xx responses per series (errors per second), not
          # a ratio against total requests.
          expr: rate(http_requests_total{status=~"5.."}[5m]) > 0.1
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "High error rate detected"
            description: "Error rate is {{ $value }} errors per second"

        - alert: HighLatency
          # 95th percentile derived from the histogram buckets; no aggregation
          # across labels is applied, so the quantile is evaluated per label
          # set.
          expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "High latency detected"
            description: "95th percentile latency is {{ $value }} seconds"

        - alert: ServiceDown
          # Any scrape target with a non-empty job label reporting up == 0.
          expr: up{job=~".+"} == 0
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "Service {{ $labels.job }} is down"
            description: "Service {{ $labels.job }} has been down for more than 1 minute"

    - name: infrastructure.rules
      interval: 30s
      rules:
        - alert: NodeNotReady
          # The Ready condition's status="true" series is 0 when the node is
          # not Ready.
          expr: kube_node_status_condition{condition="Ready",status="true"} == 0
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Node {{ $labels.node }} is not ready"
            description: "Node {{ $labels.node }} has been in a not-ready state for more than 5 minutes"

        - alert: DiskSpaceLow
          # Fraction of filesystem bytes still available is below 10%.
          expr: (node_filesystem_avail_bytes / node_filesystem_size_bytes) < 0.1
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Disk space low on {{ $labels.instance }}"
            description: "Disk {{ $labels.device }} on {{ $labels.instance }} has only {{ $value | humanizePercentage }} space available"
|
||||
|
||||
50
monitoring/event-monitoring/prometheus-rules.yaml
Normal file
50
monitoring/event-monitoring/prometheus-rules.yaml
Normal file
@@ -0,0 +1,50 @@
|
||||
# Event Bus Monitoring Rules
#
# PrometheusRule resource holding a single group (nats.rules) of four alerts.
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: event-bus-alerts
  namespace: event-bus
  labels:
    app: nats
spec:
  groups:
    - name: nats.rules
      interval: 30s
      rules:
        # Any nats_connz_connections series whose state label is not OPEN
        # reporting a value above 0 for 5 minutes.
        - alert: NATSConnectionFailure
          expr: nats_connz_connections{state!="OPEN"} > 0
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "NATS connection failure detected"
            description: "NATS has {{ $value }} non-open connections"

        # Per-second rate of nats_varz_in_msgs (5m window) above 10000,
        # sustained for 10 minutes.
        - alert: NATSHighMessageRate
          expr: rate(nats_varz_in_msgs[5m]) > 10000
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "High NATS message rate"
            description: "NATS is processing {{ $value }} messages per second"

        # Used JetStream store bytes exceed 90% of the configured maximum.
        - alert: NATSJetStreamStorageFull
          expr: (nats_jetstream_varz_store_bytes / nats_jetstream_varz_store_max_bytes) > 0.9
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "NATS JetStream storage nearly full"
            description: "JetStream storage is {{ $value | humanizePercentage }} full"

        # A scrape target of job "nats" has reported up == 0 for 1 minute.
        - alert: NATSPodDown
          expr: up{job="nats"} == 0
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "NATS pod is down"
            description: "NATS pod has been down for more than 1 minute"
|
||||
|
||||
19
monitoring/event-monitoring/setup.sh
Executable file
19
monitoring/event-monitoring/setup.sh
Executable file
@@ -0,0 +1,19 @@
|
||||
#!/bin/bash
# Setup event monitoring
#
# Applies the event-bus PrometheusRule manifest (prometheus-rules.yaml) that
# lives next to this script, then prints a summary of what is monitored.

set -e

# Resolve the manifest relative to this script rather than the caller's
# working directory, so the script works from anywhere.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

echo "📡 Setting up event monitoring..."

# Check prerequisites
command -v kubectl >/dev/null 2>&1 || { echo "❌ kubectl not found"; exit 1; }

# Apply Prometheus rules
echo "📊 Applying Prometheus rules for event bus..."
kubectl apply -f "$SCRIPT_DIR/prometheus-rules.yaml"

echo "✅ Event monitoring configured!"
echo ""
echo "📝 Monitoring:"
echo " - NATS connection metrics"
echo " - Message rate metrics"
echo " - JetStream storage metrics"
echo " - Pod health metrics"
|
||||
|
||||
36
monitoring/loki/install.sh
Executable file
36
monitoring/loki/install.sh
Executable file
@@ -0,0 +1,36 @@
|
||||
#!/bin/bash
# Install Loki Stack for Logging
#
# Installs (or upgrades) the grafana/loki-stack Helm chart into the
# "monitoring" namespace using the values.yaml that sits next to this script.

set -e

NAMESPACE="monitoring"
RELEASE_NAME="loki"

# Resolve values.yaml relative to this script rather than the caller's
# working directory, so the script works from anywhere.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

echo "📊 Installing Loki Stack..."

# Check prerequisites
command -v helm >/dev/null 2>&1 || { echo "❌ helm not found"; exit 1; }
command -v kubectl >/dev/null 2>&1 || { echo "❌ kubectl not found"; exit 1; }

# Create namespace (dry-run + apply keeps this idempotent)
echo "📦 Creating namespace: $NAMESPACE"
kubectl create namespace "$NAMESPACE" --dry-run=client -o yaml | kubectl apply -f -

# Add Grafana Helm repo
echo "📥 Adding Grafana Helm repository..."
helm repo add grafana https://grafana.github.io/helm-charts
helm repo update

# Install Loki Stack
echo "🚀 Installing Loki Stack..."
helm upgrade --install "$RELEASE_NAME" grafana/loki-stack \
  --namespace "$NAMESPACE" \
  --create-namespace \
  --values "$SCRIPT_DIR/values.yaml" \
  --wait

echo "✅ Loki Stack installed successfully!"
echo ""
echo "📝 Access Grafana:"
echo " kubectl port-forward -n $NAMESPACE svc/$RELEASE_NAME-grafana 3000:80"
|
||||
|
||||
87
monitoring/loki/values.yaml
Normal file
87
monitoring/loki/values.yaml
Normal file
@@ -0,0 +1,87 @@
|
||||
# Loki Stack Helm Values
#
# Values for the grafana/loki-stack chart: Loki itself, the Promtail log
# shipper and a Grafana instance with a Loki datasource.
# NOTE(review): nesting below was reconstructed from the chart's value layout;
# confirm against the original file before applying.
---
loki:
  enabled: true
  persistence:
    enabled: true
    storageClassName: standard
    size: 50Gi
  resources:
    requests:
      memory: 1Gi
      cpu: 500m
    limits:
      memory: 2Gi
      cpu: 1000m
  config:
    schema_config:
      configs:
        # Quoted so YAML loaders keep it a string instead of resolving it to
        # a timestamp.
        - from: "2020-10-24"
          store: boltdb-shipper
          object_store: filesystem
          schema: v11
          index:
            prefix: index_
            period: 24h
    storage_config:
      boltdb_shipper:
        active_index_directory: /loki/boltdb-shipper-active
        cache_location: /loki/boltdb-shipper-cache
        shared_store: filesystem
      filesystem:
        directory: /loki/chunks
    limits_config:
      ingestion_rate_mb: 10
      ingestion_burst_size_mb: 20
      max_query_length: 0h
      max_query_parallelism: 32
      max_streams_per_user: 10000

promtail:
  enabled: true
  config:
    clients:
      - url: http://loki:3100/loki/api/v1/push
    scrape_configs:
      - job_name: kubernetes-pods
        kubernetes_sd_configs:
          - role: pod
        relabel_configs:
          # Strip a trailing 8-10 character hex suffix from the controller
          # name and keep the result in a temporary label.
          - source_labels:
              - __meta_kubernetes_pod_controller_name
            regex: '([0-9a-z-.]+?)(-[0-9a-f]{8,10})?'
            action: replace
            target_label: __tmp_controller_name
          # app = first non-empty value among the listed source labels
          # (values are joined with ';' before the regex is applied).
          - source_labels:
              - __meta_kubernetes_pod_label_app_kubernetes_io_name
              - __meta_kubernetes_pod_label_app
              - __tmp_controller_name
              - __meta_kubernetes_pod_name
            regex: '^;*([^;]+)(;.*)?$'
            action: replace
            target_label: app
          # instance = first non-empty of the instance / release pod labels.
          - source_labels:
              - __meta_kubernetes_pod_label_app_kubernetes_io_instance
              - __meta_kubernetes_pod_label_release
            regex: '^;*([^;]+)(;.*)?$'
            action: replace
            target_label: instance
          # component = first non-empty of the component pod labels.
          - source_labels:
              - __meta_kubernetes_pod_label_app_kubernetes_io_component
              - __meta_kubernetes_pod_label_component
            regex: '^;*([^;]+)(;.*)?$'
            action: replace
            target_label: component

grafana:
  enabled: true
  datasources:
    datasources.yaml:
      apiVersion: 1
      datasources:
        - name: Loki
          type: loki
          access: proxy
          url: http://loki:3100
          isDefault: false
|
||||
|
||||
151
monitoring/metrics-dashboard/grafana-dashboard.json
Normal file
151
monitoring/metrics-dashboard/grafana-dashboard.json
Normal file
@@ -0,0 +1,151 @@
|
||||
{
|
||||
"dashboard": {
|
||||
"title": "Integration & Streamlining Success Metrics",
|
||||
"tags": ["metrics", "success", "integration"],
|
||||
"timezone": "browser",
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"title": "Infrastructure Cost Reduction",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "infrastructure_cost_reduction_percent",
|
||||
"legendFormat": "Cost Reduction"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{ "value": 0, "color": "red" },
|
||||
{ "value": 30, "color": "yellow" },
|
||||
{ "value": 40, "color": "green" }
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"title": "Shared Infrastructure Adoption",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "shared_infrastructure_adoption_percent",
|
||||
"legendFormat": "Adoption"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{ "value": 0, "color": "red" },
|
||||
{ "value": 60, "color": "yellow" },
|
||||
{ "value": 80, "color": "green" }
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"title": "Shared Packages Usage",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "shared_packages_count",
|
||||
"legendFormat": "Packages"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{ "value": 0, "color": "red" },
|
||||
{ "value": 7, "color": "yellow" },
|
||||
{ "value": 10, "color": "green" }
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"title": "Deployment Time Reduction",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "deployment_time_reduction_percent",
|
||||
"legendFormat": "Time Reduction"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{ "value": 0, "color": "red" },
|
||||
{ "value": 30, "color": "yellow" },
|
||||
{ "value": 50, "color": "green" }
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"title": "CI/CD Adoption",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "cicd_adoption_percent",
|
||||
"legendFormat": "Adoption"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{ "value": 0, "color": "red" },
|
||||
{ "value": 70, "color": "yellow" },
|
||||
{ "value": 90, "color": "green" }
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"title": "Service Uptime",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "service_uptime_percent",
|
||||
"legendFormat": "Uptime"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{ "value": 0, "color": "red" },
|
||||
{ "value": 99, "color": "yellow" },
|
||||
{ "value": 99.9, "color": "green" }
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"refresh": "30s",
|
||||
"schemaVersion": 27,
|
||||
"version": 1
|
||||
}
|
||||
}
|
||||
|
||||
32
monitoring/metrics-dashboard/setup.sh
Executable file
32
monitoring/metrics-dashboard/setup.sh
Executable file
@@ -0,0 +1,32 @@
|
||||
#!/bin/bash
# Setup metrics dashboard in Grafana
#
# Verifies that a Grafana service exists in the monitoring namespace, then
# creates/updates a ConfigMap named "metrics-dashboard" containing
# grafana-dashboard.json under the key "dashboard.json".

set -e

# Resolve the dashboard file relative to this script rather than the caller's
# working directory, so the script works from anywhere.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

NAMESPACE="monitoring"
DASHBOARD_FILE="$SCRIPT_DIR/grafana-dashboard.json"

echo "📊 Setting up Metrics Dashboard in Grafana..."

# Check if Grafana is accessible (any service whose listing line mentions "grafana")
if ! kubectl get svc -n "$NAMESPACE" | grep -q grafana; then
  echo "⚠️ Grafana not found in namespace $NAMESPACE"
  echo " → Deploy Grafana first: cd ../prometheus && ./install.sh"
  exit 1
fi

# Create ConfigMap with dashboard (dry-run + apply keeps this idempotent)
echo "📝 Creating dashboard ConfigMap..."
kubectl create configmap metrics-dashboard \
  --from-file=dashboard.json="$DASHBOARD_FILE" \
  -n "$NAMESPACE" \
  --dry-run=client -o yaml | kubectl apply -f -

echo "✅ Metrics dashboard configured!"
echo ""
echo "📝 Next steps:"
echo " 1. Access Grafana: kubectl port-forward -n $NAMESPACE svc/prometheus-grafana 3000:80"
echo " 2. Import dashboard from ConfigMap"
echo " 3. Configure data sources"
echo " 4. Set up metrics collection"
|
||||
|
||||
47
monitoring/prometheus/install.sh
Executable file
47
monitoring/prometheus/install.sh
Executable file
@@ -0,0 +1,47 @@
|
||||
#!/bin/bash
# Install Prometheus/Grafana Stack
#
# Installs (or upgrades) the prometheus-community/kube-prometheus-stack Helm
# chart into the "monitoring" namespace using the values.yaml that sits next
# to this script, then prints port-forward commands for Grafana and Prometheus.

set -e

NAMESPACE="monitoring"
RELEASE_NAME="prometheus"

# Resolve values.yaml relative to this script rather than the caller's
# working directory, so the script works from anywhere.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

echo "📊 Installing Prometheus/Grafana Stack..."

# Check if helm is installed
if ! command -v helm &> /dev/null; then
  echo "❌ Helm not found. Please install Helm first."
  exit 1
fi

# Check if kubectl is installed
if ! command -v kubectl &> /dev/null; then
  echo "❌ kubectl not found. Please install kubectl first."
  exit 1
fi

# Create namespace (dry-run + apply keeps this idempotent)
echo "📦 Creating namespace: $NAMESPACE"
kubectl create namespace "$NAMESPACE" --dry-run=client -o yaml | kubectl apply -f -

# Add Prometheus Helm repo
echo "📥 Adding Prometheus Helm repository..."
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo update

# Install Prometheus Stack
echo "🚀 Installing Prometheus Stack..."
helm upgrade --install "$RELEASE_NAME" prometheus-community/kube-prometheus-stack \
  --namespace "$NAMESPACE" \
  --create-namespace \
  --values "$SCRIPT_DIR/values.yaml" \
  --wait

echo "✅ Prometheus/Grafana Stack installed successfully!"
echo ""
echo "📝 Access Grafana:"
echo " kubectl port-forward -n $NAMESPACE svc/$RELEASE_NAME-grafana 3000:80"
echo ""
echo "📝 Access Prometheus:"
# The chart names the Prometheus service "<fullname>-prometheus", where
# fullname is "<release>-kube-prometheus-stack" truncated to 26 characters.
# For the release name used here that is "prometheus-kube-prometheus".
# NOTE(review): a different RELEASE_NAME truncates differently; confirm with
# `kubectl get svc -n $NAMESPACE` if it is ever changed.
echo " kubectl port-forward -n $NAMESPACE svc/$RELEASE_NAME-kube-prometheus-prometheus 9090:9090"
|
||||
|
||||
86
monitoring/prometheus/values.yaml
Normal file
86
monitoring/prometheus/values.yaml
Normal file
@@ -0,0 +1,86 @@
|
||||
# Prometheus Stack Helm Values
# For use with kube-prometheus-stack
#
# NOTE(review): nesting below was reconstructed from the chart's value layout;
# confirm against the original file before applying.
---
prometheus:
  prometheusSpec:
    retention: 30d
    retentionSize: 50GB
    storageSpec:
      volumeClaimTemplate:
        spec:
          accessModes: ["ReadWriteOnce"]
          storageClassName: standard
          resources:
            requests:
              storage: 100Gi
    resources:
      requests:
        memory: 2Gi
        cpu: 1000m
      limits:
        memory: 4Gi
        cpu: 2000m
    # false = do not restrict ServiceMonitor / PodMonitor / PrometheusRule
    # discovery to objects carrying this Helm release's labels.
    serviceMonitorSelectorNilUsesHelmValues: false
    podMonitorSelectorNilUsesHelmValues: false
    ruleSelectorNilUsesHelmValues: false

grafana:
  enabled: true
  # NOTE(review): plaintext default credential committed to VCS. Change in
  # production; prefer referencing an existing Secret.
  adminPassword: "admin"
  persistence:
    enabled: true
    size: 10Gi
  resources:
    requests:
      memory: 256Mi
      cpu: 100m
    limits:
      memory: 512Mi
      cpu: 500m
  dashboardProviders:
    dashboardproviders.yaml:
      apiVersion: 1
      providers:
        - name: 'default'
          orgId: 1
          folder: ''
          type: file
          disableDelete: false
          editable: true
          options:
            path: /var/lib/grafana/dashboards/default
  dashboards:
    default:
      kubernetes-cluster:
        gnetId: 7249
        revision: 1
        datasource: Prometheus

alertmanager:
  enabled: true
  config:
    global:
      resolve_timeout: 5m
    route:
      group_by: ['alertname']
      group_wait: 10s
      group_interval: 10s
      repeat_interval: 12h
      receiver: 'default'
    receivers:
      # Helm merges maps but replaces lists: the chart's default child route
      # (kept by the map merge of `route`) targets a receiver named 'null',
      # while this list replaces the chart's default receivers. Re-declare
      # 'null' so the rendered Alertmanager config does not reference an
      # undefined receiver. NOTE(review): verify against the chart version in
      # use.
      - name: 'null'
      - name: 'default'
        email_configs:
          - to: 'alerts@example.com'
            from: 'prometheus@example.com'
            smarthost: 'smtp.example.com:587'
            auth_username: 'prometheus'
            # NOTE(review): placeholder SMTP credential in plaintext. Supply
            # the real value from a secret store, not from this file.
            auth_password: 'password'
            headers:
              Subject: 'Prometheus Alert'

kubeStateMetrics:
  enabled: true

nodeExporter:
  enabled: true
|
||||
|
||||
Reference in New Issue
Block a user