Files
smom-dbis-138/scripts/azure/fix-deployment-issues.sh
defiQUG 1fb7266469 Add Oracle Aggregator and CCIP Integration
- Introduced Aggregator.sol for Chainlink-compatible oracle functionality, including round-based updates and access control.
- Added OracleWithCCIP.sol to extend Aggregator with CCIP cross-chain messaging capabilities.
- Created .gitmodules to include OpenZeppelin contracts as a submodule.
- Developed a comprehensive deployment guide in NEXT_STEPS_COMPLETE_GUIDE.md for Phase 2 and smart contract deployment.
- Implemented Vite configuration for the orchestration portal, supporting both Vue and React frameworks.
- Added server-side logic for the Multi-Cloud Orchestration Portal, including API endpoints for environment management and monitoring.
- Created scripts for resource import and usage validation across non-US regions.
- Added tests for CCIP error handling and integration to ensure robust functionality.
- Included various new files and directories for the orchestration portal and deployment scripts.
2025-12-12 14:57:48 -08:00

193 lines
8.2 KiB
Bash
Executable File

#!/usr/bin/env bash
# Comprehensive fix for deployment issues
# Deletes failed/canceled clusters and re-runs Terraform
#
# Usage: run directly; the script takes no arguments.
# External commands invoked by this script: az, jq, terraform.
#
# Strict mode:
#   -e           abort on any unhandled command failure
#   -u           treat expansion of an unset variable as an error
#   -o pipefail  a pipeline fails when ANY stage fails, so a failing
#                `terraform apply ... | tee` or `az ... | wc -l` is no longer
#                hidden behind the success of the last stage
set -euo pipefail

# Azure subscription that every `az` call in this script is pinned to.
SUBSCRIPTION_ID="fc08d829-4f14-413d-ab27-ce024425db0b"

# Repository root, resolved as two directories above the one holding this script.
PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"

# Terraform configuration directory the later steps cd into.
TERRAFORM_DIR="$PROJECT_ROOT/terraform/well-architected/cloud-sovereignty"
#######################################
# Print every AKS cluster whose name contains 'az-p-' and whose
# provisioningState equals the given value.
# Globals:   SUBSCRIPTION_ID (read)
# Arguments: $1 - provisioningState to match (e.g. Failed, Canceled)
# Outputs:   JSON array of {name, rg} objects on stdout
# Returns:   exit status of `az aks list`
#######################################
list_clusters_in_state() {
  az aks list --subscription "$SUBSCRIPTION_ID" \
    --query "[?contains(name, 'az-p-') && provisioningState == '$1'].{name:name, rg:resourceGroup}" -o json
}

#######################################
# Request deletion of each listed cluster, then wait until `az aks show`
# stops succeeding for every cluster whose deletion request was accepted.
# Globals:   SUBSCRIPTION_ID (read)
#            DELETE_WAIT_TIMEOUT (read, optional) - integer seconds allowed
#              for the whole wait phase; defaults to 1800
# Arguments: $1 - lowercase label used in messages (e.g. failed)
#            $2 - JSON array of {name, rg} objects
# Returns:   0 when every cluster is gone; 1 if a deletion request was
#            rejected or the wait phase timed out
#######################################
delete_clusters() {
  local label=$1 clusters_json=$2
  local timeout=${DELETE_WAIT_TIMEOUT:-1800}
  local rg name entry output status deadline
  local -a pending=() rejected=()

  # Process substitution keeps the loop in the current shell so the arrays
  # survive it. `az` gets /dev/null on stdin so it cannot consume the list
  # the loop is reading from.
  while IFS='|' read -r rg name; do
    echo "Deleting $label cluster: $name (RG: $rg)"
    status=0
    output=$(az aks delete --resource-group "$rg" --name "$name" \
      --subscription "$SUBSCRIPTION_ID" --yes --no-wait 2>&1 </dev/null) || status=$?
    if [ -n "$output" ]; then
      # Drop blank lines; grep exits 1 when nothing is left, which is fine.
      printf '%s\n' "$output" | grep -v '^$' || true
    fi
    if [ "$status" -eq 0 ]; then
      echo " ✅ Deletion initiated"
      pending+=("$rg|$name")
    else
      echo " ❌ Deletion request failed (az exit code $status)" >&2
      rejected+=("$name")
    fi
    echo ""
  done < <(jq -r '.[] | "\(.rg)|\(.name)"' <<<"$clusters_json")

  if (( ${#pending[@]} > 0 )); then
    echo "Waiting for $label cluster deletions to complete..."
    sleep 30
    # One shared deadline for the whole batch, so a deletion that never
    # finishes cannot hang the script forever.
    deadline=$(( SECONDS + timeout ))
    for entry in "${pending[@]}"; do
      rg=${entry%%|*}
      name=${entry#*|}
      echo -n " Waiting for $name..."
      # NOTE(review): any `az aks show` failure (not only "not found") ends
      # the wait, exactly as before - confirm whether auth/network errors
      # should be told apart here.
      while az aks show --resource-group "$rg" --name "$name" \
        --subscription "$SUBSCRIPTION_ID" >/dev/null 2>&1 </dev/null; do
        if (( SECONDS >= deadline )); then
          echo " ❌ Timed out"
          echo "Timed out after ${timeout}s waiting for $name to be deleted" >&2
          return 1
        fi
        echo -n "."
        sleep 5
      done
      echo " ✅ Deleted"
    done
  fi

  if (( ${#rejected[@]} > 0 )); then
    echo "Could not start deletion of $label cluster(s): ${rejected[*]}" >&2
    return 1
  fi
}

echo "╔════════════════════════════════════════════════════════════════╗"
echo "║ DEPLOYMENT FIX - COMPREHENSIVE CLEANUP & REDEPLOYMENT ║"
echo "╚════════════════════════════════════════════════════════════════╝"
echo ""
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
# The heading no longer carries a hard-coded count; the real number is
# discovered below and printed on the "Found ..." line.
echo "Step 1: Delete Failed Clusters"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo ""
FAILED_CLUSTERS=$(list_clusters_in_state "Failed")
FAILED_COUNT=$(jq 'length' <<<"$FAILED_CLUSTERS")
echo "Found $FAILED_COUNT failed clusters to delete"
echo ""
if [ "${FAILED_COUNT:-0}" -gt 0 ]; then
  delete_clusters "failed" "$FAILED_CLUSTERS"
else
  echo "No failed clusters to delete"
fi
echo ""
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo "Step 2: Delete Canceled Clusters"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo ""
CANCELED_CLUSTERS=$(list_clusters_in_state "Canceled")
CANCELED_COUNT=$(jq 'length' <<<"$CANCELED_CLUSTERS")
echo "Found $CANCELED_COUNT canceled clusters to delete"
echo ""
if [ "${CANCELED_COUNT:-0}" -gt 0 ]; then
  delete_clusters "canceled" "$CANCELED_CLUSTERS"
else
  echo "No canceled clusters to delete"
fi
echo ""
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo "Step 3: Clean Terraform State"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo ""
# Every terraform command from here on runs inside the configuration directory.
cd "$TERRAFORM_DIR" || {
  echo "Cannot change directory to $TERRAFORM_DIR" >&2
  exit 1
}
# This step only REPORTS the cluster resources currently tracked in state;
# nothing is removed here (see the note printed at the end of the step).
echo "Removing deleted clusters from Terraform state..."
echo ""
# Get list of all cluster resources in state.
# `|| true`: grep exits 1 when nothing matches; that case is handled by the
# empty-string check below rather than aborting the script.
TERRAFORM_STATE_CLUSTERS=$(terraform state list 2>/dev/null | grep "azurerm_kubernetes_cluster" || true)
if [ -n "$TERRAFORM_STATE_CLUSTERS" ]; then
  echo "Checking state for cluster resources..."
  # Every line already matched the grep filter above, so each resource is
  # reported unconditionally.
  while read -r resource; do
    echo " Checking: $resource"
    echo " Resource in state: $resource"
  done <<<"$TERRAFORM_STATE_CLUSTERS"
else
  echo "No cluster resources found in Terraform state"
fi
echo ""
echo "Note: Terraform will automatically handle state cleanup during apply"
echo ""
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo "Step 4: Re-run Terraform Deployment"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo ""
echo "Initializing Terraform..."
# Init stays best-effort (a previously initialized directory may still be
# usable), but its output is no longer discarded when it fails.
if ! TERRAFORM_INIT_OUTPUT=$(terraform init -upgrade 2>&1); then
  echo "⚠️ terraform init -upgrade failed; continuing with the existing initialization:" >&2
  printf '%s\n' "$TERRAFORM_INIT_OUTPUT" >&2
fi
echo ""
echo "Re-running Terraform deployment..."
echo "This will recreate all deleted clusters with proper configuration"
echo ""
echo "⚠️ This may take 15-30 minutes depending on region availability"
echo ""
# Run Terraform apply with maximum parallelism.
# The pipeline's own status would be tee's (or would abort the script before
# it can be inspected under pipefail), so errexit is suspended just long
# enough to read terraform's status from PIPESTATUS.
set +e
terraform apply -parallelism=128 -auto-approve 2>&1 | tee /tmp/terraform-apply-fixed.log
TERRAFORM_APPLY_STATUS=${PIPESTATUS[0]}
set -e
if [ "$TERRAFORM_APPLY_STATUS" -ne 0 ]; then
  echo "" >&2
  echo "❌ terraform apply failed (exit code $TERRAFORM_APPLY_STATUS)" >&2
  echo " Check the log: tail -100 /tmp/terraform-apply-fixed.log" >&2
  exit "$TERRAFORM_APPLY_STATUS"
fi
echo ""
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo "Step 5: Verify Deployment"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo ""
echo "Waiting 30 seconds for clusters to stabilize..."
sleep 30
echo ""
echo "Checking cluster status..."
echo ""
# One snapshot of every 'az-p-' cluster's provisioningState (one per line).
# Counting from a single listing keeps the three numbers mutually consistent,
# and a failing `az` is reported instead of silently producing zeros.
if ! CLUSTER_STATES=$(az aks list --subscription "$SUBSCRIPTION_ID" \
  --query "[?contains(name, 'az-p-')].provisioningState" -o tsv); then
  echo "❌ Unable to list AKS clusters; deployment status cannot be verified" >&2
  exit 1
fi
# Strip carriage returns so whole-line matching also works on CRLF output.
CLUSTER_STATES=${CLUSTER_STATES//$'\r'/}
# grep -c prints 0 but exits 1 when there is no match, hence `|| true`.
READY_COUNT=$(grep -c -x -F -- 'Succeeded' <<<"$CLUSTER_STATES" || true)
FAILED_COUNT=$(grep -c -x -F -- 'Failed' <<<"$CLUSTER_STATES" || true)
CREATING_COUNT=$(grep -c -x -F -- 'Creating' <<<"$CLUSTER_STATES" || true)
echo "📊 Deployment Status:"
echo " ✅ Ready (Succeeded): $READY_COUNT"
echo " ❌ Failed: $FAILED_COUNT"
echo " ⏳ Creating: $CREATING_COUNT"
echo ""
if [ "$CREATING_COUNT" -gt 0 ]; then
  echo "⚠️ Some clusters are still creating. Monitor with:"
  echo " az aks list --subscription $SUBSCRIPTION_ID --query \"[?contains(name, 'az-p-')].{name:name, state:provisioningState}\" -o table"
fi
if [ "$FAILED_COUNT" -gt 0 ]; then
  echo "⚠️ Some clusters failed. Check logs:"
  echo " tail -100 /tmp/terraform-apply-fixed.log"
  echo " ./scripts/azure/analyze-deployment-failures.sh"
fi
echo ""
echo "✅ Fix process complete!"
echo ""
echo "📝 Logs:"
echo " • Terraform: /tmp/terraform-apply-fixed.log"
echo " • This script: Check output above"
echo ""
echo "🎯 Next Steps:"
# The suggested command is pinned to the same subscription as every other
# query in this script, so it cannot hit the caller's default subscription.
echo " 1. Monitor cluster creation: az aks list --subscription $SUBSCRIPTION_ID --query \"[?contains(name, 'az-p-')].{name:name, state:provisioningState}\" -o table"
echo " 2. Once ready, run: ./scripts/deployment/wait-and-run-all-next-steps.sh"
echo ""