#!/usr/bin/env bash
#
# Comprehensive Azure deployment failure analysis.
# Compares Terraform logs with Azure activity logs.
#
# Usage:
#   [SUBSCRIPTION_ID=<subscription-id>] <this-script>
#
# Requires: the Azure CLI (az) and jq on PATH.
#
# Report layout:
#   Part 1  AKS clusters whose name contains 'az-p-' and whose
#           provisioningState is 'Failed', with recent activity-log errors.
#   Part 2  Same for provisioningState 'Canceled' (first 5 clusters only).
#   Part 3  Recent failed/error activity-log events for managed clusters.
#   Part 4  Error summary of $PROJECT_ROOT/tmp/terraform-apply-unlocked.log.
#   Part 5  Closing summary.

set -euo pipefail

# Subscription to inspect; the environment may override the built-in default.
SUBSCRIPTION_ID="${SUBSCRIPTION_ID:-fc08d829-4f14-413d-ab27-ce024425db0b}"
# Project root is two directories above the directory holding this script.
PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
readonly SUBSCRIPTION_ID PROJECT_ROOT

readonly RULE="━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"

#######################################
# Print a message to stderr and exit with status 1.
# Arguments: $* - message
#######################################
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

#######################################
# Print a section heading framed by two rules, followed by a blank line.
# Arguments: $1 - heading text
#######################################
print_section() {
  echo "$RULE"
  echo "$1"
  echo "$RULE"
  echo ""
}

#######################################
# Print the first N lines of a text, or a fallback line when it is empty.
# The text is passed as an argument (not piped) so that no upstream
# producer can be killed by SIGPIPE when `head` stops reading early.
# Arguments: $1 - fallback line, $2 - max lines, $3 - text
#######################################
print_head_or() {
  local fallback=$1 limit=$2 text=$3
  if [[ -z "$text" ]]; then
    echo "$fallback"
  else
    head -n "$limit" <<<"$text"
  fi
}

#######################################
# Emit a JSON array of {name, rg} for AKS clusters whose name contains
# 'az-p-' and whose provisioningState equals the given state.
# Globals:   SUBSCRIPTION_ID (read)
# Arguments: $1 - provisioning state (e.g. Failed, Canceled)
# Returns:   the exit status of `az aks list`
#######################################
list_clusters_by_state() {
  local state=$1
  az aks list --subscription "$SUBSCRIPTION_ID" \
    --query "[?contains(name, 'az-p-') && provisioningState == '${state}'].{name:name, rg:resourceGroup}" \
    -o json
}

#######################################
# Pretty-print selected properties of one cluster.
# az's stderr is deliberately NOT merged into the JSON stream: warnings on
# stderr would otherwise make jq fail to parse otherwise valid output.
# Globals:   SUBSCRIPTION_ID (read)
# Arguments: $1 - resource group, $2 - cluster name, $3 - JMESPath query
# Returns:   0 always; a failure is reported in the output instead.
#######################################
show_cluster_details() {
  local rg=$1 name=$2 jmespath=$3 details
  if details=$(az aks show --resource-group "$rg" --name "$name" \
    --subscription "$SUBSCRIPTION_ID" --query "$jmespath" -o json) \
    && jq '.' <<<"$details"; then
    return 0
  fi
  echo " Error retrieving details"
}

#######################################
# Print up to 5 formatted activity-log events for one cluster.
# Globals:   SUBSCRIPTION_ID (read)
# Arguments: $1 - resource group
#            $2 - cluster name
#            $3 - value for --max-events
#            $4 - JMESPath query selecting/projecting events
#            $5 - jq filter rendering one line per event
#            $6 - line printed when no events match
# Returns:   0 always; a failure is reported in the output instead.
#######################################
show_activity_log() {
  local rg=$1 name=$2 max_events=$3 jmespath=$4 jq_filter=$5 fallback=$6
  local resource_id events lines
  resource_id="/subscriptions/${SUBSCRIPTION_ID}/resourceGroups/${rg}/providers/Microsoft.ContainerService/managedClusters/${name}"

  # NOTE(review): --resource-group and --resource-id are both passed, as in
  # the original script; confirm the installed az version accepts the pair.
  if ! events=$(az monitor activity-log list --subscription "$SUBSCRIPTION_ID" \
    --resource-group "$rg" \
    --resource-id "$resource_id" \
    --max-events "$max_events" \
    --query "$jmespath" -o json) \
    || ! lines=$(jq -r "$jq_filter" <<<"$events"); then
    echo " Error retrieving activity log"
    return 0
  fi
  print_head_or "$fallback" 5 "$lines"
}

#######################################
# Part 3: print the 15 most recent failed/error events for managed
# clusters, newest first (lines start with the event timestamp).
# Globals:   SUBSCRIPTION_ID (read)
# Returns:   0 always; a failure is reported in the output instead.
#######################################
show_subscription_errors() {
  local events lines
  # NOTE(review): --resource-type is kept from the original script; confirm
  # it is a supported filter for `az monitor activity-log list`.
  if ! events=$(az monitor activity-log list --subscription "$SUBSCRIPTION_ID" \
    --resource-type "Microsoft.ContainerService/managedClusters" \
    --max-events 30 \
    --query "[?status.value == 'Failed' || level == 'Error'].{time:eventTimestamp, resource:resourceId, operation:operationName.localValue, status:status.value, message:statusMessage.message, error:properties.statusMessage}" \
    -o json) \
    || ! lines=$(
      # `// ""` keeps split() from aborting the filter on a null resourceId.
      jq -r '.[] | "\(.time) | \((.resource // "") | split("/") | .[-2] + "/" + .[-1]) | \(.operation) | \(.status) | \(.message // .error)"' \
        <<<"$events" | sort -r
    ); then
    echo "Error retrieving activity log"
    return 0
  fi
  print_head_or "No errors found" 15 "$lines"
}

#######################################
# Part 4: summarize errors found in the Terraform apply log, if present.
# Globals:   PROJECT_ROOT (read)
#######################################
analyze_terraform_log() {
  local tf_log="$PROJECT_ROOT/tmp/terraform-apply-unlocked.log"
  local key_errors

  if [[ ! -f "$tf_log" ]]; then
    echo "Terraform log not found"
    return 0
  fi

  # Report the path that is actually read.
  echo "Terraform Log: $tf_log"
  # grep -c exits 1 on zero matches but still prints "0"; that is not an error.
  echo "Errors found: $(grep -ci -- "error" "$tf_log" || true)"
  echo ""
  echo "Key Error Messages:"
  # -m 5 stops after five matching lines; no match is a normal outcome.
  key_errors=$(grep -iE -m 5 -- 'stopped state|operation not allowed|already exists' "$tf_log" || true)
  if [[ -n "$key_errors" ]]; then
    sed 's/^/ /' <<<"$key_errors"
  fi
}

#######################################
# Entry point: verify tooling, then print the five report parts in order.
#######################################
main() {
  local tool
  for tool in az jq; do
    command -v "$tool" >/dev/null 2>&1 \
      || die "required tool not found on PATH: $tool"
  done

  local failed_clusters failed_count canceled_clusters canceled_count rg name

  echo "╔════════════════════════════════════════════════════════════════╗"
  echo "║ AZURE DEPLOYMENT FAILURE ANALYSIS ║"
  echo "╚════════════════════════════════════════════════════════════════╝"
  echo ""

  # ---- Part 1 -------------------------------------------------------------
  print_section "Part 1: Failed Clusters Analysis"

  failed_clusters=$(list_clusters_by_state "Failed") \
    || die "failed to list AKS clusters"
  failed_count=$(jq 'length' <<<"$failed_clusters")

  echo "Found $failed_count failed clusters"
  echo ""

  if ((failed_count > 0)); then
    # The list is read on FD 3 so commands in the loop body cannot consume it
    # through their stdin.
    while IFS='|' read -r -u 3 rg name; do
      echo "Cluster: $name"
      echo "Resource Group: $rg"
      echo ""

      echo "Cluster Details:"
      show_cluster_details "$rg" "$name" \
        "{provisioningState:provisioningState, powerState:powerState.code, createdTime:createdAt, kubernetesVersion:kubernetesVersion}"
      echo ""

      echo "Recent Errors from Activity Log:"
      show_activity_log "$rg" "$name" 20 \
        "[?status.value == 'Failed' || level == 'Error'].{time:eventTimestamp, operation:operationName.localValue, status:status.value, message:statusMessage.message, error:properties.statusMessage}" \
        '.[] | " [\(.time)] \(.operation): \(.message // .error)"' \
        " No errors found"
      echo ""
      echo "---"
      echo ""
    done 3< <(jq -r '.[] | "\(.rg)|\(.name)"' <<<"$failed_clusters")
  fi

  # ---- Part 2 -------------------------------------------------------------
  echo ""
  print_section "Part 2: Canceled Clusters Analysis"

  canceled_clusters=$(list_clusters_by_state "Canceled") \
    || die "failed to list AKS clusters"
  canceled_count=$(jq 'length' <<<"$canceled_clusters")

  echo "Found $canceled_count canceled clusters"
  echo ""

  if ((canceled_count > 0)); then
    # Only the first five canceled clusters are examined.
    while IFS='|' read -r -u 3 rg name; do
      echo "Cluster: $name"
      echo "Resource Group: $rg"
      echo ""

      show_cluster_details "$rg" "$name" \
        "{provisioningState:provisioningState, powerState:powerState.code, createdTime:createdAt}"
      echo ""

      echo "Recent Activity:"
      show_activity_log "$rg" "$name" 10 \
        "[].{time:eventTimestamp, operation:operationName.localValue, status:status.value}" \
        '.[] | " [\(.time)] \(.operation): \(.status)"' \
        " No activity found"
      echo ""
    done 3< <(jq -r '.[:5][] | "\(.rg)|\(.name)"' <<<"$canceled_clusters")
  fi

  # ---- Part 3 -------------------------------------------------------------
  echo ""
  print_section "Part 3: Recent Errors Across Subscription"

  echo "Checking recent errors for AKS clusters..."
  show_subscription_errors

  # ---- Part 4 -------------------------------------------------------------
  echo ""
  print_section "Part 4: Terraform Log Analysis"

  analyze_terraform_log

  # ---- Part 5 -------------------------------------------------------------
  echo ""
  print_section "Part 5: Comparison Summary"

  echo "Comparing Terraform logs with Azure logs..."
  echo "✅ Analysis complete - See details above"
}

main "$@"