Initial commit

This commit is contained in:
Zhongwei Li
2025-11-29 17:51:20 +08:00
commit ad81bc571f
11 changed files with 3746 additions and 0 deletions

223
skills/scripts/cluster_health.py Executable file
View File

@@ -0,0 +1,223 @@
#!/usr/bin/env python3
"""
Cluster Health Check Script
Performs comprehensive cluster health diagnostics
"""
import json
import subprocess
from typing import Dict, List, Any
from datetime import datetime
def run_kubectl(args: List[str]) -> Dict[str, Any]:
    """Run a kubectl command and return its parsed JSON output.

    Args:
        args: Arguments passed to ``kubectl``; the binary name is prepended.

    Returns:
        The decoded JSON document, or ``{}`` when kubectl printed nothing.
        On failure a dict with a single ``"error"`` key is returned instead
        of raising, so callers can test ``'error' in result``.
    """
    try:
        result = subprocess.run(
            ['kubectl'] + args,
            capture_output=True,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as e:
        # Non-zero exit: report kubectl's own message, or the exit status
        # when it printed nothing to stderr.
        return {"error": e.stderr or f"kubectl exited with status {e.returncode}"}
    except OSError as e:
        # kubectl is not installed, not on PATH, or not executable.
        return {"error": f"Failed to run kubectl: {e}"}

    if not result.stdout:
        return {}
    try:
        return json.loads(result.stdout)
    except json.JSONDecodeError:
        return {"error": "Failed to parse kubectl output"}
def check_nodes() -> Dict[str, Any]:
    """Summarise node readiness and other active node conditions.

    Returns:
        The ``{"error": ...}`` dict from ``run_kubectl`` if the node query
        failed. Otherwise a dict with ``"healthy"`` and ``"unhealthy"``
        node counts and an ``"issues"`` list of human-readable messages.
    """
    nodes = run_kubectl(['get', 'nodes', '-o', 'json'])
    if 'error' in nodes:
        return nodes

    results = {
        "healthy": 0,
        "unhealthy": 0,
        "issues": [],
    }
    for node in nodes.get('items', []):
        name = node['metadata']['name']
        conditions = node.get('status', {}).get('conditions', [])

        # Only the first Ready condition is consulted. A node that reports
        # no Ready condition at all has unknown readiness, so it is counted
        # as unhealthy instead of silently dropping out of both totals.
        ready = next((c for c in conditions if c['type'] == 'Ready'), None)
        if ready is not None and ready['status'] == 'True':
            results['healthy'] += 1
        else:
            results['unhealthy'] += 1
            results['issues'].append(f"Node {name} is not Ready")

        # Any other condition with status 'True' is reported as an issue,
        # even when the node itself is Ready.
        for condition in conditions:
            if condition['type'] != 'Ready' and condition['status'] == 'True':
                results['issues'].append(
                    f"Node {name}: {condition['type']} = {condition['status']}"
                )
    return results
def check_system_pods() -> Dict[str, Any]:
    """Check pod health in the core system namespaces.

    Queries ``kube-system``, ``kube-public`` and ``kube-node-lease``.

    Returns:
        A dict with ``"healthy"`` and ``"unhealthy"`` pod counts and an
        ``"issues"`` list. If the pod query failed for every namespace,
        an ``{"error": ...}`` dict carrying the first failure is returned
        instead, so a broken kubectl is not reported as an empty but
        healthy result.
    """
    namespaces = ['kube-system', 'kube-public', 'kube-node-lease']
    results = {
        "healthy": 0,
        "unhealthy": 0,
        "issues": [],
    }
    errors = []
    for ns in namespaces:
        pods = run_kubectl(['get', 'pods', '-n', ns, '-o', 'json'])
        if 'error' in pods:
            # Best effort: one unreadable namespace does not abort the
            # whole check, but the failure is remembered.
            errors.append(pods['error'])
            continue
        for pod in pods.get('items', []):
            name = pod['metadata']['name']
            phase = pod.get('status', {}).get('phase', 'Unknown')
            if phase == 'Running':
                # A Running pod is healthy only if every container is ready.
                container_statuses = pod.get('status', {}).get('containerStatuses', [])
                if all(c.get('ready', False) for c in container_statuses):
                    results['healthy'] += 1
                else:
                    results['unhealthy'] += 1
                    results['issues'].append(f"Pod {ns}/{name}: Containers not ready")
            elif phase in ['Succeeded', 'Completed']:
                results['healthy'] += 1
            else:
                results['unhealthy'] += 1
                results['issues'].append(f"Pod {ns}/{name}: Phase is {phase}")

    if len(errors) == len(namespaces):
        # Nothing could be queried: surface the failure to the caller.
        return {"error": errors[0]}
    return results
def check_pending_pods() -> Dict[str, Any]:
    """List pods in any namespace whose phase is ``Pending``.

    Returns:
        The ``{"error": ...}`` dict from ``run_kubectl`` if the query
        failed, otherwise ``{"count": int, "pods": ["namespace/name", ...]}``.
    """
    listing = run_kubectl(['get', 'pods', '--all-namespaces', '-o', 'json'])
    if 'error' in listing:
        return listing

    stuck = []
    for item in listing.get('items', []):
        if item.get('status', {}).get('phase') != 'Pending':
            continue
        meta = item['metadata']
        pod_name, pod_namespace = meta['name'], meta['namespace']
        stuck.append(f"{pod_namespace}/{pod_name}")
    return {"count": len(stuck), "pods": stuck}
def check_failed_pods() -> Dict[str, Any]:
    """List pods in any namespace whose phase is ``Failed``.

    Returns:
        The ``{"error": ...}`` dict from ``run_kubectl`` if the query
        failed, otherwise ``{"count": int, "pods": ["namespace/name", ...]}``.
    """
    response = run_kubectl(['get', 'pods', '--all-namespaces', '-o', 'json'])
    if 'error' in response:
        return response

    # Lazily select the failed pods, then format each as namespace/name.
    failed_items = (
        entry
        for entry in response.get('items', [])
        if entry.get('status', {}).get('phase') == 'Failed'
    )
    names = []
    for entry in failed_items:
        pod_name = entry['metadata']['name']
        pod_namespace = entry['metadata']['namespace']
        names.append(f"{pod_namespace}/{pod_name}")
    return {"count": len(names), "pods": names}
def check_crashloop_pods() -> Dict[str, Any]:
    """List pods that have a container waiting in ``CrashLoopBackOff``.

    Both regular and init container statuses are inspected. Each pod is
    reported at most once, naming the first crash-looping container found.

    Returns:
        The ``{"error": ...}`` dict from ``run_kubectl`` if the query
        failed, otherwise ``{"count": int, "pods": [...]}`` where each
        entry looks like ``"namespace/name (container: cname)"``.
    """
    all_pods = run_kubectl(['get', 'pods', '--all-namespaces', '-o', 'json'])
    if 'error' in all_pods:
        return all_pods

    crashloop = []
    for pod in all_pods.get('items', []):
        status = pod.get('status', {})
        # Init containers are listed under their own key; without them a
        # pod stuck restarting an init container would go unnoticed.
        container_statuses = (
            status.get('containerStatuses', [])
            + status.get('initContainerStatuses', [])
        )
        for container in container_statuses:
            waiting = container.get('state', {}).get('waiting')
            # `or ''` tolerates a waiting state with a null reason.
            if waiting and 'CrashLoopBackOff' in (waiting.get('reason') or ''):
                name = pod['metadata']['name']
                namespace = pod['metadata']['namespace']
                container_name = container['name']
                crashloop.append(f"{namespace}/{name} (container: {container_name})")
                break  # one entry per pod is enough
    return {"count": len(crashloop), "pods": crashloop}
def main():
    """Run every health check and print a human-readable report to stdout.

    Each section prints either a summary of its check or the error returned
    by that check. Long pod lists are truncated to the first ten entries.
    """
    print("🏥 Kubernetes Cluster Health Check")
    print("=" * 60)
    print(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")

    # Check nodes
    print("🖥️ Node Health:")
    nodes = check_nodes()
    if 'error' not in nodes:
        print(f" ✅ Healthy nodes: {nodes['healthy']}")
        if nodes['unhealthy'] > 0:
            print(f" ❌ Unhealthy nodes: {nodes['unhealthy']}")
        # Node issues also include active conditions reported for Ready
        # nodes, so they are listed whenever any exist, not only when a
        # node is counted unhealthy.
        for issue in nodes['issues']:
            print(f"{issue}")
    else:
        print(f" ❌ Error: {nodes['error']}")
    print()

    # Check system pods
    print("🔧 System Pods:")
    system = check_system_pods()
    if 'error' not in system:
        print(f" ✅ Healthy: {system['healthy']}")
        if system['unhealthy'] > 0:
            print(f" ⚠️ Unhealthy: {system['unhealthy']}")
            for issue in system['issues'][:10]:  # Show first 10
                print(f"{issue}")
    else:
        print(f" ❌ Error: {system['error']}")
    print()

    # Check pending pods
    print("⏳ Pending Pods:")
    pending = check_pending_pods()
    if 'error' not in pending:
        if pending['count'] == 0:
            print(" ✅ No pods stuck in pending")
        else:
            print(f" ⚠️ {pending['count']} pods in pending state:")
            for pod in pending['pods'][:10]:
                print(f"{pod}")
    else:
        print(f" ❌ Error: {pending['error']}")
    print()

    # Check failed pods
    print("💥 Failed Pods:")
    failed = check_failed_pods()
    if 'error' not in failed:
        if failed['count'] == 0:
            print(" ✅ No failed pods")
        else:
            print(f"{failed['count']} pods in failed state:")
            for pod in failed['pods'][:10]:
                print(f"{pod}")
    else:
        print(f" ❌ Error: {failed['error']}")
    print()

    # Check crash loops
    print("🔄 Crash Loop Pods:")
    crashloop = check_crashloop_pods()
    if 'error' not in crashloop:
        if crashloop['count'] == 0:
            print(" ✅ No pods in crash loop")
        else:
            print(f"{crashloop['count']} pods in crash loop:")
            for pod in crashloop['pods'][:10]:
                print(f"{pod}")
    else:
        print(f" ❌ Error: {crashloop['error']}")
    print()

    print("=" * 60)
    print("Health check complete!")
# Run the health report only when executed as a script, not when imported.
if __name__ == "__main__":
    main()