Initial commit

skills/scripts/check_namespace.py (new executable file, 500 lines)
@@ -0,0 +1,500 @@
#!/usr/bin/env python3
"""
Kubernetes Namespace Health Check

Performs comprehensive health diagnostics for a specific namespace.
"""
import argparse
import json
import subprocess
import sys
from typing import Dict, List, Any, Optional
from datetime import datetime


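# Note: every check below shells out to kubectl, so kubectl must be on PATH and
# the active kubeconfig context needs read access to the target namespace. The
# wrapper below surfaces kubectl failures as {"error": ...} instead of raising.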
def run_kubectl(args: List[str], namespace: Optional[str] = None) -> Dict[str, Any]:
    """Run kubectl command and return parsed JSON"""
    cmd = ['kubectl'] + args
    if namespace and '-n' not in args and '--namespace' not in args:
        cmd.extend(['-n', namespace])

    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            check=True
        )
        if result.stdout:
            return json.loads(result.stdout)
        return {}
    except subprocess.CalledProcessError as e:
        return {"error": e.stderr}
    except json.JSONDecodeError:
        return {"error": "Failed to parse kubectl output", "output": result.stdout}


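# Heuristics used by check_pods: a Running pod counts as healthy only when all
# of its containers report ready and the combined restart count is below 5; at
# or above 5 restarts it is counted as crashlooping. The threshold is a rough
# proxy, not an actual CrashLoopBackOff state check.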
def check_pods(namespace: str) -> Dict[str, Any]:
    """Check pod health in namespace"""
    pods = run_kubectl(['get', 'pods', '-o', 'json'], namespace)

    if 'error' in pods:
        return pods

    results = {
        "total": 0,
        "running": 0,
        "pending": 0,
        "failed": 0,
        "succeeded": 0,
        "crashlooping": 0,
        "image_pull_errors": 0,
        "issues": [],
        "healthy_pods": [],
        "unhealthy_pods": []
    }

    for pod in pods.get('items', []):
        name = pod['metadata']['name']
        phase = pod.get('status', {}).get('phase', 'Unknown')
        results["total"] += 1

        # Check container statuses
        container_statuses = pod.get('status', {}).get('containerStatuses', [])
        restart_count = sum(c.get('restartCount', 0) for c in container_statuses)

        # Categorize pod status
        if phase == 'Running':
            results["running"] += 1
            all_ready = all(c.get('ready', False) for c in container_statuses)
            if all_ready and restart_count < 5:
                results["healthy_pods"].append(name)
            else:
                if restart_count >= 5:
                    results["crashlooping"] += 1
                    results["issues"].append(f"Pod {name}: High restart count ({restart_count})")
                if not all_ready:
                    results["issues"].append(f"Pod {name}: Not all containers ready")
                if name not in results["unhealthy_pods"]:
                    results["unhealthy_pods"].append(name)

        elif phase == 'Pending':
            results["pending"] += 1
            results["issues"].append(f"Pod {name}: Stuck in Pending state")
            results["unhealthy_pods"].append(name)

        elif phase == 'Failed':
            results["failed"] += 1
            results["issues"].append(f"Pod {name}: Failed")
            results["unhealthy_pods"].append(name)

        elif phase == 'Succeeded':
            results["succeeded"] += 1

        # Check for image pull problems (covers ErrImagePull and ImagePullBackOff)
        for container_status in container_statuses:
            waiting = container_status.get('state', {}).get('waiting', {})
            reason = waiting.get('reason', '')
            if 'ImagePull' in reason:
                results["image_pull_errors"] += 1
                if name not in results["unhealthy_pods"]:
                    results["unhealthy_pods"].append(name)
                results["issues"].append(f"Pod {name}: {reason}")

    return results


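# check_services issues one extra kubectl call per service to fetch its
# Endpoints object, so runtime grows with the number of services in the
# namespace. Headless or selector-less services may legitimately report no
# endpoint addresses.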
def check_services(namespace: str) -> Dict[str, Any]:
    """Check services and their endpoints"""
    services = run_kubectl(['get', 'services', '-o', 'json'], namespace)

    if 'error' in services:
        return services

    results = {
        "total": 0,
        "with_endpoints": 0,
        "without_endpoints": 0,
        "load_balancers": 0,
        "load_balancers_pending": 0,
        "issues": []
    }

    for svc in services.get('items', []):
        name = svc['metadata']['name']
        svc_type = svc['spec'].get('type', 'ClusterIP')
        results["total"] += 1

        # Check endpoints
        endpoints = run_kubectl(['get', 'endpoints', name, '-o', 'json'], namespace)
        if 'error' not in endpoints:
            subsets = endpoints.get('subsets', [])
            if subsets and any(s.get('addresses', []) for s in subsets):
                results["with_endpoints"] += 1
            else:
                results["without_endpoints"] += 1
                results["issues"].append(f"Service {name}: No endpoints (no pods matching selector)")

        # Check LoadBalancer status
        if svc_type == 'LoadBalancer':
            results["load_balancers"] += 1
            lb_ingress = svc.get('status', {}).get('loadBalancer', {}).get('ingress', [])
            if not lb_ingress:
                results["load_balancers_pending"] += 1
                results["issues"].append(f"Service {name}: LoadBalancer stuck in Pending")

    return results


def check_deployments(namespace: str) -> Dict[str, Any]:
    """Check deployment health"""
    deployments = run_kubectl(['get', 'deployments', '-o', 'json'], namespace)

    if 'error' in deployments:
        return deployments

    results = {
        "total": 0,
        "available": 0,
        "unavailable": 0,
        "progressing": 0,
        "issues": []
    }

    for deploy in deployments.get('items', []):
        name = deploy['metadata']['name']
        results["total"] += 1

        status = deploy.get('status', {})
        replicas = status.get('replicas', 0)
        ready_replicas = status.get('readyReplicas', 0)
        available_replicas = status.get('availableReplicas', 0)

        if available_replicas == replicas and available_replicas > 0:
            results["available"] += 1
        elif available_replicas == 0:
            results["unavailable"] += 1
            results["issues"].append(f"Deployment {name}: No replicas available ({ready_replicas}/{replicas})")
        else:
            results["progressing"] += 1
            results["issues"].append(f"Deployment {name}: Partially available ({available_replicas}/{replicas})")

    return results


def check_pvcs(namespace: str) -> Dict[str, Any]:
    """Check PersistentVolumeClaims"""
    pvcs = run_kubectl(['get', 'pvc', '-o', 'json'], namespace)

    if 'error' in pvcs:
        return pvcs

    results = {
        "total": 0,
        "bound": 0,
        "pending": 0,
        "lost": 0,
        "issues": []
    }

    for pvc in pvcs.get('items', []):
        name = pvc['metadata']['name']
        phase = pvc.get('status', {}).get('phase', 'Unknown')
        results["total"] += 1

        if phase == 'Bound':
            results["bound"] += 1
        elif phase == 'Pending':
            results["pending"] += 1
            results["issues"].append(f"PVC {name}: Stuck in Pending state")
        elif phase == 'Lost':
            results["lost"] += 1
            results["issues"].append(f"PVC {name}: Volume lost")

    return results


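# check_resource_quotas compares each quota's used vs. hard values and flags
# resources at >= 80% of the limit as "near_limit" and at >= 100% as
# "exceeded". Memory and CPU strings are normalized with the parse helpers
# defined further down.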
def check_resource_quotas(namespace: str) -> Dict[str, Any]:
    """Check resource quotas and usage"""
    quotas = run_kubectl(['get', 'resourcequota', '-o', 'json'], namespace)

    if 'error' in quotas:
        # Return all expected keys so downstream consumers don't hit KeyError
        return {"total": 0, "near_limit": [], "exceeded": [], "issues": []}

    results = {
        "total": 0,
        "near_limit": [],
        "exceeded": [],
        "issues": []
    }

    for quota in quotas.get('items', []):
        name = quota['metadata']['name']
        results["total"] += 1

        status = quota.get('status', {})
        hard = status.get('hard', {})
        used = status.get('used', {})

        for resource, limit in hard.items():
            usage = used.get(resource, '0')

            # Parse values (handle different formats: CPU, memory, counts)
            try:
                if resource.endswith('memory'):
                    # Convert to bytes for comparison
                    limit_val = parse_memory(limit)
                    usage_val = parse_memory(usage)
                elif resource.endswith('cpu'):
                    # Convert to millicores
                    limit_val = parse_cpu(limit)
                    usage_val = parse_cpu(usage)
                else:
                    # Plain numbers
                    limit_val = int(limit)
                    usage_val = int(usage)

                if limit_val > 0:
                    usage_percent = (usage_val / limit_val) * 100

                    if usage_percent >= 100:
                        results["exceeded"].append(resource)
                        results["issues"].append(f"Quota {name}: {resource} exceeded ({usage}/{limit})")
                    elif usage_percent >= 80:
                        results["near_limit"].append(resource)
                        results["issues"].append(f"Quota {name}: {resource} near limit ({usage}/{limit}, {usage_percent:.0f}%)")

            except (ValueError, AttributeError):
                continue

    return results


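# Worked examples for the two parsers below (values verifiable by hand):
#   parse_memory("512Mi") -> 512 * 1024**2 = 536870912 bytes
#   parse_cpu("250m")     -> 250 millicores
#   parse_cpu("2")        -> 2000 millicores
# They only cover the binary suffixes Ki/Mi/Gi/Ti and plain or "m"-suffixed CPU
# values; a decimal suffix such as "1G" raises ValueError, which the quota
# check above catches and skips.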
def parse_memory(value: str) -> int:
    """Parse memory string to bytes"""
    units = {'Ki': 1024, 'Mi': 1024**2, 'Gi': 1024**3, 'Ti': 1024**4}
    for unit, multiplier in units.items():
        if value.endswith(unit):
            return int(value[:-2]) * multiplier
    return int(value)


def parse_cpu(value: str) -> int:
    """Parse CPU string to millicores"""
    if value.endswith('m'):
        return int(value[:-1])
    return int(float(value) * 1000)


def get_recent_events(namespace: str, limit: int = 10) -> List[Dict[str, Any]]:
    """Get recent events in namespace"""
    events = run_kubectl(['get', 'events', '--sort-by=.lastTimestamp', '-o', 'json'], namespace)

    if 'error' in events:
        return []

    recent_events = []
    for event in events.get('items', [])[-limit:]:
        recent_events.append({
            "type": event.get('type', 'Unknown'),
            "reason": event.get('reason', ''),
            "message": event.get('message', ''),
            "object": f"{event.get('involvedObject', {}).get('kind', '')}/{event.get('involvedObject', {}).get('name', '')}",
            "count": event.get('count', 1),
            "last_timestamp": event.get('lastTimestamp', '')
        })

    return recent_events


def generate_recommendations(results: Dict[str, Any]) -> List[str]:
    """Generate actionable recommendations based on findings"""
    recommendations = []

    # Use .get() with defaults so a check that returned only an "error" key
    # does not crash recommendation generation.

    # Pod recommendations
    if results['pods'].get('pending', 0) > 0:
        recommendations.append("⚠️ Check pending pods with: kubectl describe pod <pod-name> -n <namespace>")
        recommendations.append("⚠️ Verify node resources: kubectl describe nodes")

    if results['pods'].get('crashlooping', 0) > 0:
        recommendations.append("⚠️ Investigate crashlooping pods: kubectl logs <pod-name> -n <namespace> --previous")

    if results['pods'].get('image_pull_errors', 0) > 0:
        recommendations.append("⚠️ Fix image pull errors: verify image name, check imagePullSecrets")

    # Service recommendations
    if results['services'].get('without_endpoints', 0) > 0:
        recommendations.append("⚠️ Services without endpoints: check pod selectors match pod labels")

    if results['services'].get('load_balancers_pending', 0) > 0:
        recommendations.append("⚠️ LoadBalancer stuck: check cloud provider controller logs")

    # Deployment recommendations
    if results['deployments'].get('unavailable', 0) > 0:
        recommendations.append("⚠️ Unavailable deployments: check pod errors and resource availability")

    # PVC recommendations
    if results['pvcs'].get('pending', 0) > 0:
        recommendations.append("⚠️ Pending PVCs: verify StorageClass exists and provisioner is working")

    # Quota recommendations
    if results['quotas'].get('exceeded'):
        recommendations.append(f"🚨 Resource quotas exceeded: {', '.join(results['quotas']['exceeded'])}")
        recommendations.append("🚨 Action required: increase quota or reduce resource requests")

    if results['quotas'].get('near_limit'):
        recommendations.append(f"⚠️ Near quota limits: {', '.join(results['quotas']['near_limit'])}")

    if not recommendations:
        recommendations.append("✅ No critical issues detected")

    return recommendations


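# main() exits 0 when the overall status is "healthy" or "degraded" and 1 when
# it is "critical" (5 or more detected issues), so the script can be used
# directly as a gate in CI or cron-based monitoring.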
def main():
    parser = argparse.ArgumentParser(
        description="Comprehensive health check for a Kubernetes namespace",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Check namespace with human-readable output
  %(prog)s my-namespace

  # Output as JSON
  %(prog)s my-namespace --json

  # Include more events
  %(prog)s my-namespace --events 20
"""
    )

    parser.add_argument(
        "namespace",
        help="Namespace to check"
    )

    parser.add_argument(
        "--json",
        action="store_true",
        help="Output results as JSON"
    )

    parser.add_argument(
        "--events",
        type=int,
        default=10,
        help="Number of recent events to include (default: 10)"
    )

    args = parser.parse_args()

    # Perform all checks
    results = {
        "namespace": args.namespace,
        "timestamp": datetime.utcnow().isoformat() + "Z",
        "pods": check_pods(args.namespace),
        "services": check_services(args.namespace),
        "deployments": check_deployments(args.namespace),
        "pvcs": check_pvcs(args.namespace),
        "quotas": check_resource_quotas(args.namespace),
        "recent_events": get_recent_events(args.namespace, args.events)
    }

    # Generate recommendations
    results["recommendations"] = generate_recommendations(results)

    # Determine overall health
    total_issues = (
        len(results["pods"].get("issues", [])) +
        len(results["services"].get("issues", [])) +
        len(results["deployments"].get("issues", [])) +
        len(results["pvcs"].get("issues", [])) +
        len(results["quotas"].get("issues", []))
    )

    results["health_status"] = "healthy" if total_issues == 0 else "degraded" if total_issues < 5 else "critical"

    if args.json:
        print(json.dumps(results, indent=2))
    else:
        # Human-readable output
        print(f"🔍 Namespace Health Check: {args.namespace}")
        print(f"⏰ Timestamp: {results['timestamp']}")
        print(f"📊 Overall Status: {results['health_status'].upper()}\n")

        # Pods
        print("📦 Pods:")
        print(f"   Total: {results['pods']['total']}")
        print(f"   Running: {results['pods']['running']}")
        print(f"   Pending: {results['pods']['pending']}")
        print(f"   Failed: {results['pods']['failed']}")
        if results['pods']['crashlooping'] > 0:
            print(f"   ⚠️ CrashLooping: {results['pods']['crashlooping']}")
        if results['pods']['image_pull_errors'] > 0:
            print(f"   ⚠️ ImagePull Errors: {results['pods']['image_pull_errors']}")
        print()

        # Services
        print("🌐 Services:")
        print(f"   Total: {results['services']['total']}")
        print(f"   With Endpoints: {results['services']['with_endpoints']}")
        if results['services']['without_endpoints'] > 0:
            print(f"   ⚠️ Without Endpoints: {results['services']['without_endpoints']}")
        if results['services']['load_balancers_pending'] > 0:
            print(f"   ⚠️ LB Pending: {results['services']['load_balancers_pending']}")
        print()

        # Deployments
        if results['deployments']['total'] > 0:
            print("🚀 Deployments:")
            print(f"   Total: {results['deployments']['total']}")
            print(f"   Available: {results['deployments']['available']}")
            if results['deployments']['unavailable'] > 0:
                print(f"   ⚠️ Unavailable: {results['deployments']['unavailable']}")
            print()

        # PVCs
        if results['pvcs']['total'] > 0:
            print("💾 PersistentVolumeClaims:")
            print(f"   Total: {results['pvcs']['total']}")
            print(f"   Bound: {results['pvcs']['bound']}")
            if results['pvcs']['pending'] > 0:
                print(f"   ⚠️ Pending: {results['pvcs']['pending']}")
            print()

        # Quotas
        if results['quotas']['total'] > 0:
            print("📏 Resource Quotas:")
            print(f"   Total: {results['quotas']['total']}")
            if results['quotas']['exceeded']:
                print(f"   🚨 Exceeded: {', '.join(results['quotas']['exceeded'])}")
            if results['quotas']['near_limit']:
                print(f"   ⚠️ Near Limit: {', '.join(results['quotas']['near_limit'])}")
            print()

        # Issues
        if total_issues > 0:
            print(f"⚠️ Issues ({total_issues}):")
            all_issues = (
                results["pods"].get("issues", []) +
                results["services"].get("issues", []) +
                results["deployments"].get("issues", []) +
                results["pvcs"].get("issues", []) +
                results["quotas"].get("issues", [])
            )
            for issue in all_issues[:10]:  # Show first 10
                print(f"  - {issue}")
            if len(all_issues) > 10:
                print(f"  ... and {len(all_issues) - 10} more (use --json for full list)")
            print()

        # Recommendations
        print("💡 Recommendations:")
        for rec in results["recommendations"]:
            print(f"  {rec}")

    sys.exit(0 if results["health_status"] in ["healthy", "degraded"] else 1)


if __name__ == "__main__":
    main()