mirror of
https://github.com/lordmathis/dev-cluster.git
synced 2025-12-22 16:44:24 +00:00
Update health_monitor.yaml
This commit is contained in:
129
.github/workflows/health_monitor.yaml
vendored
129
.github/workflows/health_monitor.yaml
vendored
@@ -2,132 +2,57 @@ name: Cluster Health Monitor
|
|||||||
|
|
||||||
on:
|
on:
|
||||||
schedule:
|
schedule:
|
||||||
# Run every hour
|
|
||||||
- cron: '0 * * * *'
|
- cron: '0 * * * *'
|
||||||
workflow_dispatch: # Allow manual triggering
|
workflow_dispatch:
|
||||||
|
|
||||||
env:
|
|
||||||
# Timeout for each health check request (in seconds)
|
|
||||||
REQUEST_TIMEOUT: 30
|
|
||||||
# Expected HTTP status codes (comma-separated)
|
|
||||||
EXPECTED_STATUS_CODES: "200,301,302,401,403"
|
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
health-check:
|
health-check:
|
||||||
name: Monitor Cluster Endpoints
|
name: Monitor Cluster Endpoints
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Set up endpoint monitoring
|
- name: Check endpoints
|
||||||
run: |
|
run: |
|
||||||
# Create endpoints file from secrets
|
# Simple and clean endpoint checker
|
||||||
cat << 'EOF' > endpoints.txt
|
ENDPOINTS="${{ secrets.CLUSTER_ENDPOINTS }}"
|
||||||
${{ secrets.CLUSTER_ENDPOINTS }}
|
|
||||||
EOF
|
|
||||||
|
|
||||||
# Validate endpoints file
|
# Validate input
|
||||||
if [ ! -s endpoints.txt ]; then
|
if [[ -z "$ENDPOINTS" ]]; then
|
||||||
echo "❌ No endpoints found in CLUSTER_ENDPOINTS secret"
|
echo "❌ No endpoints found in CLUSTER_ENDPOINTS secret"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
endpoint_count=$(grep -v '^[[:space:]]*$' endpoints.txt | grep -v '^[[:space:]]*#' | wc -l)
|
# Initialize counters
|
||||||
echo "📋 Found $endpoint_count endpoints to monitor"
|
total=0
|
||||||
echo ""
|
failed=0
|
||||||
|
|
||||||
- name: Monitor endpoints
|
|
||||||
run: |
|
|
||||||
#!/bin/bash
|
|
||||||
set -e
|
|
||||||
|
|
||||||
# Colors for output
|
# Process each endpoint
|
||||||
RED='\033[0;31m'
|
while IFS= read -r url; do
|
||||||
GREEN='\033[0;32m'
|
|
||||||
YELLOW='\033[1;33m'
|
|
||||||
BLUE='\033[0;34m'
|
|
||||||
NC='\033[0m' # No Color
|
|
||||||
|
|
||||||
# Counters
|
|
||||||
total_endpoints=0
|
|
||||||
successful_checks=0
|
|
||||||
failed_checks=0
|
|
||||||
failed_endpoints=()
|
|
||||||
endpoint_index=0
|
|
||||||
|
|
||||||
echo -e "${BLUE}🔍 Starting cluster health monitoring...${NC}"
|
|
||||||
echo ""
|
|
||||||
|
|
||||||
# Read endpoints and perform health checks
|
|
||||||
while IFS= read -r endpoint || [ -n "$endpoint" ]; do
|
|
||||||
# Skip empty lines and comments
|
# Skip empty lines and comments
|
||||||
if [[ -z "$endpoint" || "$endpoint" =~ ^[[:space:]]*# ]]; then
|
[[ -z "$url" || "$url" =~ ^[[:space:]]*# ]] && continue
|
||||||
continue
|
url=$(echo "$url" | xargs)
|
||||||
fi
|
[[ -z "$url" ]] && continue
|
||||||
|
|
||||||
# Trim whitespace
|
total=$((total + 1))
|
||||||
endpoint=$(echo "$endpoint" | xargs)
|
echo "Testing: $url"
|
||||||
|
|
||||||
if [[ -z "$endpoint" ]]; then
|
# Simple curl check - fail if HTTP code >= 400 or connection fails
|
||||||
continue
|
if curl -sfLI --max-time 10 "$url" >/dev/null 2>&1; then
|
||||||
fi
|
echo "✅ OK"
|
||||||
|
|
||||||
total_endpoints=$((total_endpoints + 1))
|
|
||||||
endpoint_index=$((endpoint_index + 1))
|
|
||||||
|
|
||||||
echo -e "${BLUE}Testing endpoint #${endpoint_index}:${NC}"
|
|
||||||
|
|
||||||
# Perform health check with curl
|
|
||||||
response=$(curl -s -w "\n%{http_code}\n%{time_total}" \
|
|
||||||
--max-time $REQUEST_TIMEOUT \
|
|
||||||
--connect-timeout 10 \
|
|
||||||
--retry 2 \
|
|
||||||
--retry-delay 1 \
|
|
||||||
--user-agent "GitHub-Actions-Health-Monitor/1.0" \
|
|
||||||
--location \
|
|
||||||
--insecure \
|
|
||||||
"$endpoint" 2>/dev/null || echo -e "\nERROR\n0")
|
|
||||||
|
|
||||||
# Parse response
|
|
||||||
http_code=$(echo "$response" | tail -n 2 | head -n 1)
|
|
||||||
response_time=$(echo "$response" | tail -n 1)
|
|
||||||
|
|
||||||
# Check if request was successful
|
|
||||||
if [[ "$http_code" == "ERROR" || -z "$http_code" ]]; then
|
|
||||||
echo -e " ${RED}❌ Connection failed${NC}"
|
|
||||||
failed_checks=$((failed_checks + 1))
|
|
||||||
failed_endpoints+=("Endpoint #$endpoint_index (Connection failed)")
|
|
||||||
else
|
else
|
||||||
# Check if status code is expected
|
echo "❌ FAILED"
|
||||||
if [[ ",$EXPECTED_STATUS_CODES," == *",$http_code,"* ]]; then
|
failed=$((failed + 1))
|
||||||
echo -e " ${GREEN}✅ HTTP $http_code${NC} (${response_time}s)"
|
|
||||||
successful_checks=$((successful_checks + 1))
|
|
||||||
else
|
|
||||||
echo -e " ${RED}❌ HTTP $http_code${NC} (${response_time}s) - Unexpected status code"
|
|
||||||
failed_checks=$((failed_checks + 1))
|
|
||||||
failed_endpoints+=("Endpoint #$endpoint_index (HTTP $http_code)")
|
|
||||||
fi
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo ""
|
echo ""
|
||||||
|
|
||||||
done < endpoints.txt
|
done <<< "$ENDPOINTS"
|
||||||
|
|
||||||
# Summary
|
# Summary
|
||||||
echo -e "${BLUE}📊 Health Check Summary:${NC}"
|
echo "📊 Summary: $((total - failed))/$total endpoints OK"
|
||||||
echo -e " Total endpoints: $total_endpoints"
|
|
||||||
echo -e " ${GREEN}Successful: $successful_checks${NC}"
|
|
||||||
echo -e " ${RED}Failed: $failed_checks${NC}"
|
|
||||||
echo ""
|
|
||||||
|
|
||||||
# Report failed endpoints
|
# Exit with error if any failed
|
||||||
if [ $failed_checks -gt 0 ]; then
|
if [[ $failed -gt 0 ]]; then
|
||||||
echo -e "${RED}💥 Failed endpoints:${NC}"
|
echo "💥 $failed endpoint(s) failed!"
|
||||||
for failed_endpoint in "${failed_endpoints[@]}"; do
|
|
||||||
echo -e " ${RED}• $failed_endpoint${NC}"
|
|
||||||
done
|
|
||||||
echo ""
|
|
||||||
|
|
||||||
exit 1
|
exit 1
|
||||||
else
|
else
|
||||||
echo -e "${GREEN}🎉 All endpoints are healthy!${NC}"
|
echo "🎉 All endpoints healthy!"
|
||||||
fi
|
fi
|
||||||
|
|||||||
Reference in New Issue
Block a user