Initial commit

skills/scripts/check_namespace.py · 500 lines · Executable file
@@ -0,0 +1,500 @@
#!/usr/bin/env python3
"""
Kubernetes Namespace Health Check
Performs comprehensive health diagnostics for a specific namespace.
"""
import argparse
import json
import subprocess
import sys
from typing import Dict, List, Any, Optional
from datetime import datetime, timezone


def run_kubectl(args: List[str], namespace: Optional[str] = None) -> Dict[str, Any]:
    """Run a kubectl command and return its parsed JSON output."""
    cmd = ['kubectl'] + args
    if namespace and '-n' not in args and '--namespace' not in args:
        cmd.extend(['-n', namespace])

    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            check=True
        )
        if result.stdout:
            return json.loads(result.stdout)
        return {}
    except subprocess.CalledProcessError as e:
        return {"error": e.stderr}
    except json.JSONDecodeError:
        return {"error": "Failed to parse kubectl output", "output": result.stdout}


def check_pods(namespace: str) -> Dict[str, Any]:
    """Check pod health in the namespace."""
    pods = run_kubectl(['get', 'pods', '-o', 'json'], namespace)

    if 'error' in pods:
        return pods

    results = {
        "total": 0,
        "running": 0,
        "pending": 0,
        "failed": 0,
        "succeeded": 0,
        "crashlooping": 0,
        "image_pull_errors": 0,
        "issues": [],
        "healthy_pods": [],
        "unhealthy_pods": []
    }

    for pod in pods.get('items', []):
        name = pod['metadata']['name']
        phase = pod.get('status', {}).get('phase', 'Unknown')
        results["total"] += 1

        # Check container statuses
        container_statuses = pod.get('status', {}).get('containerStatuses', [])
        restart_count = sum(c.get('restartCount', 0) for c in container_statuses)

        # Categorize pod status
        if phase == 'Running':
            results["running"] += 1
            all_ready = all(c.get('ready', False) for c in container_statuses)
            if all_ready and restart_count < 5:
                results["healthy_pods"].append(name)
            else:
                if restart_count >= 5:
                    results["crashlooping"] += 1
                    results["issues"].append(f"Pod {name}: High restart count ({restart_count})")
                if not all_ready:
                    results["issues"].append(f"Pod {name}: Not all containers ready")
                # Record the pod once, even when it has multiple problems
                results["unhealthy_pods"].append(name)

        elif phase == 'Pending':
            results["pending"] += 1
            results["issues"].append(f"Pod {name}: Stuck in Pending state")
            results["unhealthy_pods"].append(name)

        elif phase == 'Failed':
            results["failed"] += 1
            results["issues"].append(f"Pod {name}: Failed")
            results["unhealthy_pods"].append(name)

        elif phase == 'Succeeded':
            results["succeeded"] += 1

        # Check for image pull errors (ImagePullBackOff, ErrImagePull)
        for container_status in container_statuses:
            waiting = container_status.get('state', {}).get('waiting', {})
            reason = waiting.get('reason', '')
            if 'ImagePull' in reason:
                results["image_pull_errors"] += 1
                if name not in results["unhealthy_pods"]:
                    results["unhealthy_pods"].append(name)
                results["issues"].append(f"Pod {name}: {reason}")

    return results


def check_services(namespace: str) -> Dict[str, Any]:
    """Check services and their endpoints."""
    services = run_kubectl(['get', 'services', '-o', 'json'], namespace)

    if 'error' in services:
        return services

    results = {
        "total": 0,
        "with_endpoints": 0,
        "without_endpoints": 0,
        "load_balancers": 0,
        "load_balancers_pending": 0,
        "issues": []
    }

    for svc in services.get('items', []):
        name = svc['metadata']['name']
        svc_type = svc['spec'].get('type', 'ClusterIP')
        results["total"] += 1

        # Check endpoints
        endpoints = run_kubectl(['get', 'endpoints', name, '-o', 'json'], namespace)
        if 'error' not in endpoints:
            subsets = endpoints.get('subsets', [])
            if subsets and any(s.get('addresses', []) for s in subsets):
                results["with_endpoints"] += 1
            else:
                results["without_endpoints"] += 1
                results["issues"].append(f"Service {name}: No endpoints (no pods matching selector)")

        # Check LoadBalancer status
        if svc_type == 'LoadBalancer':
            results["load_balancers"] += 1
            lb_ingress = svc.get('status', {}).get('loadBalancer', {}).get('ingress', [])
            if not lb_ingress:
                results["load_balancers_pending"] += 1
                results["issues"].append(f"Service {name}: LoadBalancer stuck in Pending")

    return results
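
# Illustrative Endpoints object consulted above (shape of the API object):
#   {"subsets": [{"addresses": [{"ip": "10.0.0.5"}], "ports": [{"port": 8080}]}]}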


def check_deployments(namespace: str) -> Dict[str, Any]:
    """Check deployment health."""
    deployments = run_kubectl(['get', 'deployments', '-o', 'json'], namespace)

    if 'error' in deployments:
        return deployments

    results = {
        "total": 0,
        "available": 0,
        "unavailable": 0,
        "progressing": 0,
        "issues": []
    }

    for deploy in deployments.get('items', []):
        name = deploy['metadata']['name']
        results["total"] += 1

        status = deploy.get('status', {})
        replicas = status.get('replicas', 0)
        ready_replicas = status.get('readyReplicas', 0)
        available_replicas = status.get('availableReplicas', 0)

        # Note: a deployment scaled to zero is reported as unavailable here.
        if available_replicas == replicas and available_replicas > 0:
            results["available"] += 1
        elif available_replicas == 0:
            results["unavailable"] += 1
            results["issues"].append(f"Deployment {name}: No replicas available ({ready_replicas}/{replicas})")
        else:
            results["progressing"] += 1
            results["issues"].append(f"Deployment {name}: Partially available ({available_replicas}/{replicas})")

    return results


def check_pvcs(namespace: str) -> Dict[str, Any]:
    """Check PersistentVolumeClaims."""
    pvcs = run_kubectl(['get', 'pvc', '-o', 'json'], namespace)

    if 'error' in pvcs:
        return pvcs

    results = {
        "total": 0,
        "bound": 0,
        "pending": 0,
        "lost": 0,
        "issues": []
    }

    for pvc in pvcs.get('items', []):
        name = pvc['metadata']['name']
        phase = pvc.get('status', {}).get('phase', 'Unknown')
        results["total"] += 1

        if phase == 'Bound':
            results["bound"] += 1
        elif phase == 'Pending':
            results["pending"] += 1
            results["issues"].append(f"PVC {name}: Stuck in Pending state")
        elif phase == 'Lost':
            results["lost"] += 1
            results["issues"].append(f"PVC {name}: Volume lost")

    return results


def check_resource_quotas(namespace: str) -> Dict[str, Any]:
    """Check resource quotas and usage."""
    quotas = run_kubectl(['get', 'resourcequota', '-o', 'json'], namespace)

    if 'error' in quotas:
        # Return the full shape so downstream consumers can index
        # 'near_limit' and 'exceeded' without a KeyError.
        return {"total": 0, "near_limit": [], "exceeded": [], "issues": []}

    results = {
        "total": 0,
        "near_limit": [],
        "exceeded": [],
        "issues": []
    }

    for quota in quotas.get('items', []):
        name = quota['metadata']['name']
        results["total"] += 1

        status = quota.get('status', {})
        hard = status.get('hard', {})
        used = status.get('used', {})

        for resource, limit in hard.items():
            usage = used.get(resource, '0')

            # Parse values (handle different formats: CPU, memory, counts)
            try:
                if resource.endswith('memory'):
                    # Convert to bytes for comparison
                    limit_val = parse_memory(limit)
                    usage_val = parse_memory(usage)
                elif resource.endswith('cpu'):
                    # Convert to millicores
                    limit_val = parse_cpu(limit)
                    usage_val = parse_cpu(usage)
                else:
                    # Plain numbers
                    limit_val = int(limit)
                    usage_val = int(usage)

                if limit_val > 0:
                    usage_percent = (usage_val / limit_val) * 100

                    if usage_percent >= 100:
                        results["exceeded"].append(resource)
                        results["issues"].append(f"Quota {name}: {resource} exceeded ({usage}/{limit})")
                    elif usage_percent >= 80:
                        results["near_limit"].append(resource)
                        results["issues"].append(f"Quota {name}: {resource} near limit ({usage}/{limit}, {usage_percent:.0f}%)")

            except (ValueError, AttributeError):
                continue

    return results
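
# Illustrative ResourceQuota status this parses (shape of the API object):
#   {"hard": {"requests.cpu": "4", "requests.memory": "8Gi"},
#    "used": {"requests.cpu": "500m", "requests.memory": "512Mi"}}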


def parse_memory(value: str) -> int:
    """Parse a memory string to bytes."""
    units = {'Ki': 1024, 'Mi': 1024**2, 'Gi': 1024**3, 'Ti': 1024**4}
    for unit, multiplier in units.items():
        if value.endswith(unit):
            return int(value[:-2]) * multiplier
    return int(value)


def parse_cpu(value: str) -> int:
    """Parse a CPU string to millicores."""
    if value.endswith('m'):
        return int(value[:-1])
    return int(float(value) * 1000)
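
# Illustrative conversions (derived from the parsers above):
#   parse_memory('512Mi') == 512 * 1024**2
#   parse_cpu('250m') == 250
#   parse_cpu('2') == 2000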


def get_recent_events(namespace: str, limit: int = 10) -> List[Dict[str, Any]]:
    """Get recent events in the namespace."""
    events = run_kubectl(['get', 'events', '--sort-by=.lastTimestamp', '-o', 'json'], namespace)

    if 'error' in events:
        return []

    recent_events = []
    for event in events.get('items', [])[-limit:]:
        recent_events.append({
            "type": event.get('type', 'Unknown'),
            "reason": event.get('reason', ''),
            "message": event.get('message', ''),
            "object": f"{event.get('involvedObject', {}).get('kind', '')}/{event.get('involvedObject', {}).get('name', '')}",
            "count": event.get('count', 1),
            "last_timestamp": event.get('lastTimestamp', '')
        })

    return recent_events


def generate_recommendations(results: Dict[str, Any]) -> List[str]:
    """Generate actionable recommendations based on findings."""
    recommendations = []

    # Use .get() throughout: a check that failed returns an error dict
    # without the usual counter keys.
    pods = results.get('pods', {})
    services = results.get('services', {})
    deployments = results.get('deployments', {})
    pvcs = results.get('pvcs', {})
    quotas = results.get('quotas', {})

    # Pod recommendations
    if pods.get('pending', 0) > 0:
        recommendations.append("⚠️ Check pending pods with: kubectl describe pod <pod-name> -n <namespace>")
        recommendations.append("⚠️ Verify node resources: kubectl describe nodes")

    if pods.get('crashlooping', 0) > 0:
        recommendations.append("⚠️ Investigate crashlooping pods: kubectl logs <pod-name> -n <namespace> --previous")

    if pods.get('image_pull_errors', 0) > 0:
        recommendations.append("⚠️ Fix image pull errors: verify image name, check imagePullSecrets")

    # Service recommendations
    if services.get('without_endpoints', 0) > 0:
        recommendations.append("⚠️ Services without endpoints: check pod selectors match pod labels")

    if services.get('load_balancers_pending', 0) > 0:
        recommendations.append("⚠️ LoadBalancer stuck: check cloud provider controller logs")

    # Deployment recommendations
    if deployments.get('unavailable', 0) > 0:
        recommendations.append("⚠️ Unavailable deployments: check pod errors and resource availability")

    # PVC recommendations
    if pvcs.get('pending', 0) > 0:
        recommendations.append("⚠️ Pending PVCs: verify StorageClass exists and provisioner is working")

    # Quota recommendations
    if quotas.get('exceeded'):
        recommendations.append(f"🚨 Resource quotas exceeded: {', '.join(quotas['exceeded'])}")
        recommendations.append("🚨 Action required: increase quota or reduce resource requests")

    if quotas.get('near_limit'):
        recommendations.append(f"⚠️ Near quota limits: {', '.join(quotas['near_limit'])}")

    if not recommendations:
        recommendations.append("✅ No critical issues detected")

    return recommendations


def main():
    parser = argparse.ArgumentParser(
        description="Comprehensive health check for a Kubernetes namespace",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Check namespace with human-readable output
  %(prog)s my-namespace

  # Output as JSON
  %(prog)s my-namespace --json

  # Include more events
  %(prog)s my-namespace --events 20
"""
    )

    parser.add_argument(
        "namespace",
        help="Namespace to check"
    )

    parser.add_argument(
        "--json",
        action="store_true",
        help="Output results as JSON"
    )

    parser.add_argument(
        "--events",
        type=int,
        default=10,
        help="Number of recent events to include (default: 10)"
    )

    args = parser.parse_args()

    # Perform all checks
    results = {
        "namespace": args.namespace,
        # datetime.utcnow() is deprecated; build an explicit UTC timestamp
        "timestamp": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
        "pods": check_pods(args.namespace),
        "services": check_services(args.namespace),
        "deployments": check_deployments(args.namespace),
        "pvcs": check_pvcs(args.namespace),
        "quotas": check_resource_quotas(args.namespace),
        "recent_events": get_recent_events(args.namespace, args.events)
    }

    # Generate recommendations
    results["recommendations"] = generate_recommendations(results)

    # Determine overall health
    total_issues = (
        len(results["pods"].get("issues", [])) +
        len(results["services"].get("issues", [])) +
        len(results["deployments"].get("issues", [])) +
        len(results["pvcs"].get("issues", [])) +
        len(results["quotas"].get("issues", []))
    )

    results["health_status"] = "healthy" if total_issues == 0 else "degraded" if total_issues < 5 else "critical"

    if args.json:
        print(json.dumps(results, indent=2))
    else:
        # Human-readable output
        print(f"🔍 Namespace Health Check: {args.namespace}")
        print(f"⏰ Timestamp: {results['timestamp']}")
        print(f"📊 Overall Status: {results['health_status'].upper()}\n")

        # Pods
        print("📦 Pods:")
        print(f"  Total: {results['pods']['total']}")
        print(f"  Running: {results['pods']['running']}")
        print(f"  Pending: {results['pods']['pending']}")
        print(f"  Failed: {results['pods']['failed']}")
        if results['pods']['crashlooping'] > 0:
            print(f"  ⚠️ CrashLooping: {results['pods']['crashlooping']}")
        if results['pods']['image_pull_errors'] > 0:
            print(f"  ⚠️ ImagePull Errors: {results['pods']['image_pull_errors']}")
        print()

        # Services
        print("🌐 Services:")
        print(f"  Total: {results['services']['total']}")
        print(f"  With Endpoints: {results['services']['with_endpoints']}")
        if results['services']['without_endpoints'] > 0:
            print(f"  ⚠️ Without Endpoints: {results['services']['without_endpoints']}")
        if results['services']['load_balancers_pending'] > 0:
            print(f"  ⚠️ LB Pending: {results['services']['load_balancers_pending']}")
        print()

        # Deployments
        if results['deployments']['total'] > 0:
            print("🚀 Deployments:")
            print(f"  Total: {results['deployments']['total']}")
            print(f"  Available: {results['deployments']['available']}")
            if results['deployments']['unavailable'] > 0:
                print(f"  ⚠️ Unavailable: {results['deployments']['unavailable']}")
            print()

        # PVCs
        if results['pvcs']['total'] > 0:
            print("💾 PersistentVolumeClaims:")
            print(f"  Total: {results['pvcs']['total']}")
            print(f"  Bound: {results['pvcs']['bound']}")
            if results['pvcs']['pending'] > 0:
                print(f"  ⚠️ Pending: {results['pvcs']['pending']}")
            print()

        # Quotas
        if results['quotas']['total'] > 0:
            print("📏 Resource Quotas:")
            print(f"  Total: {results['quotas']['total']}")
            if results['quotas']['exceeded']:
                print(f"  🚨 Exceeded: {', '.join(results['quotas']['exceeded'])}")
            if results['quotas']['near_limit']:
                print(f"  ⚠️ Near Limit: {', '.join(results['quotas']['near_limit'])}")
            print()

        # Issues
        if total_issues > 0:
            print(f"⚠️ Issues ({total_issues}):")
            all_issues = (
                results["pods"].get("issues", []) +
                results["services"].get("issues", []) +
                results["deployments"].get("issues", []) +
                results["pvcs"].get("issues", []) +
                results["quotas"].get("issues", [])
            )
            for issue in all_issues[:10]:  # Show first 10
                print(f"  - {issue}")
            if len(all_issues) > 10:
                print(f"  ... and {len(all_issues) - 10} more (use --json for full list)")
            print()

        # Recommendations
        print("💡 Recommendations:")
        for rec in results["recommendations"]:
            print(f"  {rec}")

    sys.exit(0 if results["health_status"] in ["healthy", "degraded"] else 1)


if __name__ == "__main__":
    main()

skills/scripts/cluster_health.py · 223 lines · Executable file
@@ -0,0 +1,223 @@
#!/usr/bin/env python3
"""
Cluster Health Check Script
Performs comprehensive cluster health diagnostics.
"""
import json
import subprocess
from typing import Dict, List, Any
from datetime import datetime


def run_kubectl(args: List[str]) -> Dict[str, Any]:
    """Run a kubectl command and return its parsed JSON output."""
    try:
        result = subprocess.run(
            ['kubectl'] + args,
            capture_output=True,
            text=True,
            check=True
        )
        return json.loads(result.stdout) if result.stdout else {}
    except subprocess.CalledProcessError as e:
        return {"error": e.stderr}
    except json.JSONDecodeError:
        return {"error": "Failed to parse kubectl output"}


def check_nodes() -> Dict[str, Any]:
    """Check node health."""
    nodes = run_kubectl(['get', 'nodes', '-o', 'json'])
    if 'error' in nodes:
        return nodes

    results = {
        "healthy": 0,
        "unhealthy": 0,
        "issues": []
    }

    for node in nodes.get('items', []):
        name = node['metadata']['name']
        conditions = node.get('status', {}).get('conditions', [])

        # A node is healthy only if its Ready condition is explicitly True;
        # a missing Ready condition also counts as unhealthy.
        is_ready = any(c['type'] == 'Ready' and c['status'] == 'True'
                       for c in conditions)
        if is_ready:
            results['healthy'] += 1
        else:
            results['unhealthy'] += 1
            results['issues'].append(f"Node {name} is not Ready")

        # Any other condition that is True signals a problem
        # (e.g. MemoryPressure, DiskPressure, PIDPressure)
        for condition in conditions:
            if condition['type'] != 'Ready' and condition['status'] == 'True':
                results['issues'].append(f"Node {name}: {condition['type']} = {condition['status']}")

    return results
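
# Illustrative node condition entry (shape of the API object):
#   {"type": "Ready", "status": "True", "reason": "KubeletReady", ...}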


def check_system_pods() -> Dict[str, Any]:
    """Check critical system pods."""
    namespaces = ['kube-system', 'kube-public', 'kube-node-lease']
    results = {
        "healthy": 0,
        "unhealthy": 0,
        "issues": []
    }

    for ns in namespaces:
        pods = run_kubectl(['get', 'pods', '-n', ns, '-o', 'json'])
        if 'error' in pods:
            continue

        for pod in pods.get('items', []):
            name = pod['metadata']['name']
            phase = pod.get('status', {}).get('phase', 'Unknown')

            if phase == 'Running':
                # Check if all containers are ready
                container_statuses = pod.get('status', {}).get('containerStatuses', [])
                all_ready = all(c.get('ready', False) for c in container_statuses)

                if all_ready:
                    results['healthy'] += 1
                else:
                    results['unhealthy'] += 1
                    results['issues'].append(f"Pod {ns}/{name}: Containers not ready")
            elif phase == 'Succeeded':
                # 'Succeeded' is the API phase; kubectl displays it as 'Completed'
                results['healthy'] += 1
            else:
                results['unhealthy'] += 1
                results['issues'].append(f"Pod {ns}/{name}: Phase is {phase}")

    return results


def check_pending_pods() -> Dict[str, Any]:
    """Check for pods stuck in Pending."""
    all_pods = run_kubectl(['get', 'pods', '--all-namespaces', '-o', 'json'])
    if 'error' in all_pods:
        return all_pods

    pending = []
    for pod in all_pods.get('items', []):
        if pod.get('status', {}).get('phase') == 'Pending':
            name = pod['metadata']['name']
            namespace = pod['metadata']['namespace']
            pending.append(f"{namespace}/{name}")

    return {"count": len(pending), "pods": pending}


def check_failed_pods() -> Dict[str, Any]:
    """Check for failed pods."""
    all_pods = run_kubectl(['get', 'pods', '--all-namespaces', '-o', 'json'])
    if 'error' in all_pods:
        return all_pods

    failed = []
    for pod in all_pods.get('items', []):
        if pod.get('status', {}).get('phase') == 'Failed':
            name = pod['metadata']['name']
            namespace = pod['metadata']['namespace']
            failed.append(f"{namespace}/{name}")

    return {"count": len(failed), "pods": failed}


def check_crashloop_pods() -> Dict[str, Any]:
    """Check for pods in a crash loop."""
    all_pods = run_kubectl(['get', 'pods', '--all-namespaces', '-o', 'json'])
    if 'error' in all_pods:
        return all_pods

    crashloop = []
    for pod in all_pods.get('items', []):
        container_statuses = pod.get('status', {}).get('containerStatuses', [])
        for container in container_statuses:
            state = container.get('state', {})
            if 'waiting' in state and 'CrashLoopBackOff' in state['waiting'].get('reason', ''):
                name = pod['metadata']['name']
                namespace = pod['metadata']['namespace']
                container_name = container['name']
                crashloop.append(f"{namespace}/{name} (container: {container_name})")
                break

    return {"count": len(crashloop), "pods": crashloop}


def main():
    print("🏥 Kubernetes Cluster Health Check")
    print("=" * 60)
    print(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")

    # Check nodes
    print("🖥️ Node Health:")
    nodes = check_nodes()
    if 'error' not in nodes:
        print(f"  ✅ Healthy nodes: {nodes['healthy']}")
        if nodes['unhealthy'] > 0:
            print(f"  ❌ Unhealthy nodes: {nodes['unhealthy']}")
            for issue in nodes['issues']:
                print(f"    • {issue}")
    else:
        print(f"  ❌ Error: {nodes['error']}")
    print()

    # Check system pods
    print("🔧 System Pods:")
    system = check_system_pods()
    if 'error' not in system:
        print(f"  ✅ Healthy: {system['healthy']}")
        if system['unhealthy'] > 0:
            print(f"  ⚠️ Unhealthy: {system['unhealthy']}")
            for issue in system['issues'][:10]:  # Show first 10
                print(f"    • {issue}")
    else:
        print(f"  ❌ Error: {system['error']}")
    print()

    # Check pending pods
    print("⏳ Pending Pods:")
    pending = check_pending_pods()
    if 'error' not in pending:
        if pending['count'] == 0:
            print("  ✅ No pods stuck in pending")
        else:
            print(f"  ⚠️ {pending['count']} pods in pending state:")
            for pod in pending['pods'][:10]:
                print(f"    • {pod}")
    else:
        print(f"  ❌ Error: {pending['error']}")
    print()

    # Check failed pods
    print("💥 Failed Pods:")
    failed = check_failed_pods()
    if 'error' not in failed:
        if failed['count'] == 0:
            print("  ✅ No failed pods")
        else:
            print(f"  ❌ {failed['count']} pods in failed state:")
            for pod in failed['pods'][:10]:
                print(f"    • {pod}")
    else:
        print(f"  ❌ Error: {failed['error']}")
    print()

    # Check crash loops
    print("🔄 Crash Loop Pods:")
    crashloop = check_crashloop_pods()
    if 'error' not in crashloop:
        if crashloop['count'] == 0:
            print("  ✅ No pods in crash loop")
        else:
            print(f"  ❌ {crashloop['count']} pods in crash loop:")
            for pod in crashloop['pods'][:10]:
                print(f"    • {pod}")
    else:
        print(f"  ❌ Error: {crashloop['error']}")
    print()

    print("=" * 60)
    print("Health check complete!")


if __name__ == "__main__":
    main()

skills/scripts/diagnose_pod.py · 157 lines · Executable file
@@ -0,0 +1,157 @@
#!/usr/bin/env python3
"""
Comprehensive Pod Diagnostics Script
Analyzes a pod's health and returns structured diagnostic information.
"""
import json
import subprocess
import sys
from typing import Dict, List, Any


def run_kubectl(args: List[str]) -> Dict[str, Any]:
    """Run a kubectl command and return its parsed JSON output."""
    try:
        result = subprocess.run(
            ['kubectl'] + args,
            capture_output=True,
            text=True,
            check=True
        )
        return json.loads(result.stdout) if result.stdout else {}
    except subprocess.CalledProcessError as e:
        return {"error": e.stderr}
    except json.JSONDecodeError:
        return {"error": "Failed to parse kubectl output"}


def check_pod_status(namespace: str, pod: str) -> Dict[str, Any]:
    """Get pod status and basic info."""
    return run_kubectl(['get', 'pod', pod, '-n', namespace, '-o', 'json'])


def check_events(namespace: str, pod: str) -> Dict[str, Any]:
    """Get events related to the pod."""
    return run_kubectl(['get', 'events', '-n', namespace,
                        '--field-selector', f'involvedObject.name={pod}',
                        '-o', 'json', '--sort-by', '.lastTimestamp'])


def check_resource_usage(namespace: str, pod: str) -> Dict[str, Any]:
    """Get resource usage if the metrics server is available."""
    # `kubectl top` prints a plain-text table, not JSON, so call it
    # directly instead of going through run_kubectl.
    try:
        result = subprocess.run(['kubectl', 'top', 'pod', pod, '-n', namespace],
                                capture_output=True, text=True, check=True)
        return {"output": result.stdout}
    except subprocess.CalledProcessError as e:
        return {"error": e.stderr}


def analyze_pod(pod_data: Dict[str, Any]) -> Dict[str, Any]:
    """Analyze pod data and identify issues."""
    issues = []
    recommendations = []

    status = pod_data.get('status', {})
    spec = pod_data.get('spec', {})

    # Check phase
    phase = status.get('phase', 'Unknown')
    if phase not in ['Running', 'Succeeded']:
        issues.append(f"Pod is in {phase} phase")

    # Check container statuses
    container_statuses = status.get('containerStatuses', [])
    for container in container_statuses:
        name = container.get('name')
        ready = container.get('ready', False)

        if not ready:
            issues.append(f"Container {name} is not ready")

        state = container.get('state', {})
        if 'waiting' in state:
            reason = state['waiting'].get('reason', 'Unknown')
            message = state['waiting'].get('message', '')
            issues.append(f"Container {name} waiting: {reason} - {message}")

            if reason == 'ImagePullBackOff':
                recommendations.append("Check image name and registry credentials")
            elif reason == 'CrashLoopBackOff':
                recommendations.append(f"Check logs for container {name} to identify crash cause")

        if 'terminated' in state:
            reason = state['terminated'].get('reason', 'Unknown')
            exit_code = state['terminated'].get('exitCode', 0)
            issues.append(f"Container {name} terminated: {reason} (exit code {exit_code})")

        restart_count = container.get('restartCount', 0)
        if restart_count > 5:
            issues.append(f"Container {name} has restarted {restart_count} times")
            recommendations.append(f"Investigate crash loops in container {name}")

    # Check resource requests/limits
    for container in spec.get('containers', []):
        resources = container.get('resources', {})
        if not resources.get('requests'):
            recommendations.append(f"Consider setting resource requests for container {container.get('name')}")
        if not resources.get('limits'):
            recommendations.append(f"Consider setting resource limits for container {container.get('name')}")

    # Check restart policy
    restart_policy = spec.get('restartPolicy', 'Always')
    if restart_policy == 'Never' and issues:
        recommendations.append("Restart policy is 'Never' - pod won't restart automatically")

    return {
        "phase": phase,
        "issues": issues,
        "recommendations": recommendations
    }
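
# Illustrative call (not part of the original script): analyze_pod is pure,
# so it can be exercised on a minimal manifest dict:
#   analyze_pod({"status": {"phase": "Pending"}, "spec": {}})
#   -> {"phase": "Pending", "issues": ["Pod is in Pending phase"], "recommendations": []}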


def main():
    if len(sys.argv) != 3:
        print("Usage: diagnose_pod.py <namespace> <pod-name>")
        sys.exit(1)

    namespace = sys.argv[1]
    pod = sys.argv[2]

    print(f"🔍 Diagnosing pod: {pod} in namespace: {namespace}\n")

    # Get pod details
    pod_data = check_pod_status(namespace, pod)
    if 'error' in pod_data:
        print(f"❌ Error fetching pod: {pod_data['error']}")
        sys.exit(1)

    # Analyze pod
    analysis = analyze_pod(pod_data)

    print(f"📊 Pod Phase: {analysis['phase']}\n")

    if analysis['issues']:
        print("⚠️ Issues Found:")
        for issue in analysis['issues']:
            print(f"  • {issue}")
        print()
    else:
        print("✅ No issues detected\n")

    if analysis['recommendations']:
        print("💡 Recommendations:")
        for rec in analysis['recommendations']:
            print(f"  • {rec}")
        print()

    # Get events
    events_data = check_events(namespace, pod)
    if 'items' in events_data and events_data['items']:
        print("📋 Recent Events:")
        for event in events_data['items'][-5:]:  # Last 5 events
            msg = event.get('message', '')
            reason = event.get('reason', '')
            print(f"  • {reason}: {msg}")
        print()

    # Try to get resource usage
    print("📈 Resource Usage:")
    resource_data = check_resource_usage(namespace, pod)
    if 'output' in resource_data:
        for line in resource_data['output'].splitlines():
            print(f"  {line}")
    else:
        print("  Metrics server not available")


if __name__ == "__main__":
    main()