Initial commit

2025-11-29 17:51:20 +08:00
commit ad81bc571f
11 changed files with 3746 additions and 0 deletions
--- a/skills/scripts/diagnose_pod.py
+++ b/skills/scripts/diagnose_pod.py
@@ -0,0 +1,157 @@
+#!/usr/bin/env python3
+"""
+Comprehensive Pod Diagnostics Script
+Analyzes a pod's health via kubectl and prints a human-readable diagnostic report.
+
+Usage: diagnose_pod.py <namespace> <pod-name>
+"""
+import json
+import subprocess
+import sys
+from typing import Dict, List, Any
+
+def run_kubectl(args: List[str]) -> Dict[str, Any]:
+    """Run a kubectl command and return its parsed JSON output.
+
+    Args:
+        args: Arguments passed to ``kubectl`` (without the ``kubectl`` itself).
+
+    Returns:
+        The decoded JSON document, or an empty dict when kubectl printed
+        nothing. Every failure is reported as ``{"error": <message>}``
+        instead of raising, so callers only need to check for that key.
+    """
+    try:
+        result = subprocess.run(
+            ['kubectl'] + args,
+            capture_output=True,
+            text=True,
+            check=True
+        )
+    except subprocess.CalledProcessError as e:
+        # Non-zero exit status: kubectl's own explanation is on stderr.
+        return {"error": e.stderr}
+    except OSError as e:
+        # kubectl is not on PATH or cannot be executed; report it through
+        # the same error contract instead of crashing with a traceback.
+        return {"error": f"Failed to run kubectl: {e}"}
+
+    if not result.stdout:
+        return {}
+    try:
+        return json.loads(result.stdout)
+    except json.JSONDecodeError:
+        return {"error": "Failed to parse kubectl output"}
+
+def check_pod_status(namespace: str, pod: str) -> Dict[str, Any]:
+    """Fetch the pod object as JSON via ``kubectl get pod``.
+
+    Returns whatever ``run_kubectl`` returns for that command.
+    """
+    kubectl_args = ['get', 'pod', pod, '-n', namespace, '-o', 'json']
+    return run_kubectl(kubectl_args)
+
+def check_events(namespace: str, pod: str) -> Dict[str, Any]:
+    """Get events related to the pod, sorted by their last timestamp.
+
+    Args:
+        namespace: Namespace the pod lives in.
+        pod: Name of the pod.
+
+    Returns:
+        The result of ``run_kubectl`` for ``kubectl get events -o json``
+        restricted to this pod.
+    """
+    # Filter on kind as well as name: a name-only selector also matches
+    # events of any other object in the namespace that shares the pod's name.
+    selector = f'involvedObject.kind=Pod,involvedObject.name={pod}'
+    return run_kubectl(['get', 'events', '-n', namespace,
+                        '--field-selector', selector,
+                        '-o', 'json', '--sort-by', '.lastTimestamp'])
+
+def check_resource_usage(namespace: str, pod: str) -> Dict[str, Any]:
+    """Get the pod's current CPU and memory usage via ``kubectl top pod``.
+
+    ``kubectl top`` prints a plain-text table rather than JSON, so the
+    command is run directly here and its single data row is split into
+    fields instead of being JSON-decoded.
+
+    Args:
+        namespace: Namespace the pod lives in.
+        pod: Name of the pod.
+
+    Returns:
+        ``{"pod": ..., "cpu": ..., "memory": ...}`` on success, otherwise
+        ``{"error": <message>}`` (for example when kubectl exits non-zero).
+    """
+    try:
+        result = subprocess.run(
+            ['kubectl', 'top', 'pod', pod, '-n', namespace, '--no-headers'],
+            capture_output=True,
+            text=True,
+            check=True
+        )
+    except subprocess.CalledProcessError as e:
+        return {"error": e.stderr}
+    except OSError as e:
+        # kubectl is not on PATH or cannot be executed.
+        return {"error": f"Failed to run kubectl: {e}"}
+
+    # With --no-headers the row is expected to be: NAME CPU(cores) MEMORY(bytes)
+    fields = result.stdout.split()
+    if len(fields) < 3:
+        return {"error": "Failed to parse kubectl output"}
+    return {"pod": fields[0], "cpu": fields[1], "memory": fields[2]}
+
+def analyze_pod(pod_data: Dict[str, Any]) -> Dict[str, Any]:
+    """Analyze pod data and identify issues.
+
+    Args:
+        pod_data: Pod object as a dict (e.g. from ``kubectl get pod -o json``);
+            only its ``status`` and ``spec`` sections are read.
+
+    Returns:
+        A dict with the pod ``phase``, a list of detected ``issues`` and a
+        list of ``recommendations`` (both lists of human-readable strings).
+    """
+    issues = []
+    recommendations = []
+
+    status = pod_data.get('status', {})
+    spec = pod_data.get('spec', {})
+    restart_policy = spec.get('restartPolicy', 'Always')
+
+    # Check phase
+    phase = status.get('phase', 'Unknown')
+    if phase not in ['Running', 'Succeeded']:
+        issues.append(f"Pod is in {phase} phase")
+
+    # Check init and regular container statuses. Exiting with code 0 is the
+    # expected end state for init containers and, unless the restart policy
+    # is 'Always', for regular containers too (e.g. finished Jobs), so such
+    # containers are not reported as "not ready" or "terminated".
+    status_groups = [
+        ('Init container', status.get('initContainerStatuses') or [], True),
+        ('Container', status.get('containerStatuses') or [],
+         restart_policy != 'Always'),
+    ]
+    for label, statuses, clean_exit_ok in status_groups:
+        for container in statuses:
+            name = container.get('name')
+            state = container.get('state', {})
+            terminated = state.get('terminated')
+            completed = (
+                clean_exit_ok
+                and terminated is not None
+                and terminated.get('exitCode', 0) == 0
+            )
+
+            if not container.get('ready', False) and not completed:
+                issues.append(f"{label} {name} is not ready")
+
+            if 'waiting' in state:
+                reason = state['waiting'].get('reason', 'Unknown')
+                message = state['waiting'].get('message', '')
+                issues.append(f"{label} {name} waiting: {reason} - {message}")
+
+                # ErrImagePull is the first failed pull attempt; the kubelet
+                # reports ImagePullBackOff once it starts backing off.
+                if reason in ('ImagePullBackOff', 'ErrImagePull'):
+                    recommendations.append("Check image name and registry credentials")
+                elif reason == 'CrashLoopBackOff':
+                    recommendations.append(f"Check logs for container {name} to identify crash cause")
+
+            if terminated is not None and not completed:
+                reason = terminated.get('reason', 'Unknown')
+                exit_code = terminated.get('exitCode', 0)
+                issues.append(f"{label} {name} terminated: {reason} (exit code {exit_code})")
+
+            restart_count = container.get('restartCount', 0)
+            if restart_count > 5:
+                issues.append(f"{label} {name} has restarted {restart_count} times")
+                recommendations.append(f"Investigate crash loops in container {name}")
+
+    # Check resource requests/limits
+    for container in spec.get('containers', []):
+        resources = container.get('resources', {})
+        if not resources.get('requests'):
+            recommendations.append(f"Consider setting resource requests for container {container.get('name')}")
+        if not resources.get('limits'):
+            recommendations.append(f"Consider setting resource limits for container {container.get('name')}")
+
+    # Check restart policy: only worth mentioning when something is wrong.
+    if restart_policy == 'Never' and issues:
+        recommendations.append("Restart policy is 'Never' - pod won't restart automatically")
+
+    return {
+        "phase": phase,
+        "issues": issues,
+        "recommendations": recommendations
+    }
+
+def main():
+    """Command-line entry point: diagnose one pod and print a report.
+
+    Expects exactly two arguments, ``<namespace> <pod-name>``. Exits with
+    status 1 on a usage error or when the pod itself cannot be fetched;
+    failures fetching events or metrics are reported but are not fatal.
+    """
+    if len(sys.argv) != 3:
+        print("Usage: diagnose_pod.py <namespace> <pod-name>")
+        sys.exit(1)
+
+    namespace = sys.argv[1]
+    pod = sys.argv[2]
+
+    print(f"🔍 Diagnosing pod: {pod} in namespace: {namespace}\n")
+
+    # Get pod details
+    pod_data = check_pod_status(namespace, pod)
+    if 'error' in pod_data:
+        print(f"❌ Error fetching pod: {pod_data['error']}")
+        sys.exit(1)
+
+    # Analyze pod
+    analysis = analyze_pod(pod_data)
+
+    print(f"📊 Pod Phase: {analysis['phase']}\n")
+
+    if analysis['issues']:
+        print("⚠️  Issues Found:")
+        for issue in analysis['issues']:
+            print(f"   • {issue}")
+        print()
+    else:
+        print("✅ No issues detected\n")
+
+    if analysis['recommendations']:
+        print("💡 Recommendations:")
+        for rec in analysis['recommendations']:
+            print(f"   • {rec}")
+        print()
+
+    # Get events
+    events_data = check_events(namespace, pod)
+    if 'error' in events_data:
+        # Say so explicitly rather than silently dropping the section, so a
+        # missing event list is not mistaken for "no events".
+        print(f"⚠️  Could not fetch events: {events_data['error']}\n")
+    elif events_data.get('items'):
+        print("📋 Recent Events:")
+        for event in events_data['items'][-5:]:  # Last 5 events
+            msg = event.get('message', '')
+            reason = event.get('reason', '')
+            print(f"   • {reason}: {msg}")
+        print()
+
+    # Try to get resource usage
+    print("📈 Resource Usage:")
+    resource_data = check_resource_usage(namespace, pod)
+    if 'error' in resource_data:
+        print("   Metrics server not available")
+    elif resource_data:
+        # Show whatever usage fields were returned instead of discarding them.
+        for key, value in resource_data.items():
+            print(f"   • {key}: {value}")
+    else:
+        print("   (Run 'kubectl top pod' manually for current usage)")
+
+# Run the diagnosis only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()