#!/usr/bin/env python3 """ Comprehensive Pod Diagnostics Script Analyzes a pod's health and returns structured diagnostic information """ import json import subprocess import sys from typing import Dict, List, Any def run_kubectl(args: List[str]) -> Dict[str, Any]: """Run kubectl command and return parsed JSON""" try: result = subprocess.run( ['kubectl'] + args, capture_output=True, text=True, check=True ) return json.loads(result.stdout) if result.stdout else {} except subprocess.CalledProcessError as e: return {"error": e.stderr} except json.JSONDecodeError: return {"error": "Failed to parse kubectl output"} def check_pod_status(namespace: str, pod: str) -> Dict[str, Any]: """Get pod status and basic info""" return run_kubectl(['get', 'pod', pod, '-n', namespace, '-o', 'json']) def check_events(namespace: str, pod: str) -> Dict[str, Any]: """Get events related to the pod""" return run_kubectl(['get', 'events', '-n', namespace, '--field-selector', f'involvedObject.name={pod}', '-o', 'json', '--sort-by', '.lastTimestamp']) def check_resource_usage(namespace: str, pod: str) -> Dict[str, Any]: """Get resource usage if metrics server is available""" result = run_kubectl(['top', 'pod', pod, '-n', namespace]) return result def analyze_pod(pod_data: Dict[str, Any]) -> Dict[str, Any]: """Analyze pod data and identify issues""" issues = [] recommendations = [] status = pod_data.get('status', {}) spec = pod_data.get('spec', {}) # Check phase phase = status.get('phase', 'Unknown') if phase not in ['Running', 'Succeeded']: issues.append(f"Pod is in {phase} phase") # Check container statuses container_statuses = status.get('containerStatuses', []) for container in container_statuses: name = container.get('name') ready = container.get('ready', False) if not ready: issues.append(f"Container {name} is not ready") state = container.get('state', {}) if 'waiting' in state: reason = state['waiting'].get('reason', 'Unknown') message = state['waiting'].get('message', '') issues.append(f"Container {name} waiting: {reason} - {message}") if reason == 'ImagePullBackOff': recommendations.append("Check image name and registry credentials") elif reason == 'CrashLoopBackOff': recommendations.append(f"Check logs for container {name} to identify crash cause") if 'terminated' in state: reason = state['terminated'].get('reason', 'Unknown') exit_code = state['terminated'].get('exitCode', 0) issues.append(f"Container {name} terminated: {reason} (exit code {exit_code})") restart_count = container.get('restartCount', 0) if restart_count > 5: issues.append(f"Container {name} has restarted {restart_count} times") recommendations.append(f"Investigate crash loops in container {name}") # Check resource requests/limits for container in spec.get('containers', []): resources = container.get('resources', {}) if not resources.get('requests'): recommendations.append(f"Consider setting resource requests for container {container.get('name')}") if not resources.get('limits'): recommendations.append(f"Consider setting resource limits for container {container.get('name')}") # Check restart policy restart_policy = spec.get('restartPolicy', 'Always') if restart_policy == 'Never' and issues: recommendations.append("Restart policy is 'Never' - pod won't restart automatically") return { "phase": phase, "issues": issues, "recommendations": recommendations } def main(): if len(sys.argv) != 3: print("Usage: diagnose_pod.py ") sys.exit(1) namespace = sys.argv[1] pod = sys.argv[2] print(f"🔍 Diagnosing pod: {pod} in namespace: {namespace}\n") # Get pod details pod_data = check_pod_status(namespace, pod) if 'error' in pod_data: print(f"❌ Error fetching pod: {pod_data['error']}") sys.exit(1) # Analyze pod analysis = analyze_pod(pod_data) print(f"📊 Pod Phase: {analysis['phase']}\n") if analysis['issues']: print("⚠️ Issues Found:") for issue in analysis['issues']: print(f" • {issue}") print() else: print("✅ No issues detected\n") if analysis['recommendations']: print("💡 Recommendations:") for rec in analysis['recommendations']: print(f" • {rec}") print() # Get events events_data = check_events(namespace, pod) if 'items' in events_data and events_data['items']: print("📋 Recent Events:") for event in events_data['items'][-5:]: # Last 5 events msg = event.get('message', '') reason = event.get('reason', '') print(f" • {reason}: {msg}") print() # Try to get resource usage print("📈 Resource Usage:") resource_data = check_resource_usage(namespace, pod) if 'error' not in resource_data: print(" (Run 'kubectl top pod' manually for current usage)") else: print(" Metrics server not available") if __name__ == "__main__": main()