Initial commit

skills/scripts/check_namespace.py · 500 lines · Executable file
@@ -0,0 +1,500 @@
#!/usr/bin/env python3
"""
Kubernetes Namespace Health Check
Performs comprehensive health diagnostics for a specific namespace.
"""
import argparse
import json
import subprocess
import sys
from typing import Dict, List, Any, Optional
from datetime import datetime, timezone


def run_kubectl(args: List[str], namespace: Optional[str] = None) -> Dict[str, Any]:
    """Run a kubectl command and return its parsed JSON output."""
    cmd = ['kubectl'] + args
    if namespace and '-n' not in args and '--namespace' not in args:
        cmd.extend(['-n', namespace])

    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            check=True
        )
        if result.stdout:
            return json.loads(result.stdout)
        return {}
    except subprocess.CalledProcessError as e:
        return {"error": e.stderr}
    except json.JSONDecodeError:
        return {"error": "Failed to parse kubectl output", "output": result.stdout}


def check_pods(namespace: str) -> Dict[str, Any]:
    """Check pod health in the namespace."""
    pods = run_kubectl(['get', 'pods', '-o', 'json'], namespace)

    if 'error' in pods:
        return pods

    results = {
        "total": 0,
        "running": 0,
        "pending": 0,
        "failed": 0,
        "succeeded": 0,
        "crashlooping": 0,
        "image_pull_errors": 0,
        "issues": [],
        "healthy_pods": [],
        "unhealthy_pods": []
    }

    for pod in pods.get('items', []):
        name = pod['metadata']['name']
        phase = pod.get('status', {}).get('phase', 'Unknown')
        results["total"] += 1

        # Check container statuses
        container_statuses = pod.get('status', {}).get('containerStatuses', [])
        restart_count = sum(c.get('restartCount', 0) for c in container_statuses)

        # Categorize pod status
        if phase == 'Running':
            results["running"] += 1
            all_ready = all(c.get('ready', False) for c in container_statuses)
            if all_ready and restart_count < 5:
                results["healthy_pods"].append(name)
            else:
                if restart_count >= 5:
                    results["crashlooping"] += 1
                    results["issues"].append(f"Pod {name}: High restart count ({restart_count})")
                if not all_ready:
                    results["issues"].append(f"Pod {name}: Not all containers ready")
                # Record the pod once, even when it has multiple problems
                results["unhealthy_pods"].append(name)

        elif phase == 'Pending':
            results["pending"] += 1
            results["issues"].append(f"Pod {name}: Stuck in Pending state")
            results["unhealthy_pods"].append(name)

        elif phase == 'Failed':
            results["failed"] += 1
            results["issues"].append(f"Pod {name}: Failed")
            results["unhealthy_pods"].append(name)

        elif phase == 'Succeeded':
            results["succeeded"] += 1

        # Check for image pull errors (ImagePullBackOff, ErrImagePull)
        for container_status in container_statuses:
            waiting = container_status.get('state', {}).get('waiting', {})
            reason = waiting.get('reason', '')
            if 'ImagePull' in reason:
                results["image_pull_errors"] += 1
                if name not in results["unhealthy_pods"]:
                    results["unhealthy_pods"].append(name)
                results["issues"].append(f"Pod {name}: {reason}")

    return results


def check_services(namespace: str) -> Dict[str, Any]:
    """Check services and their endpoints."""
    services = run_kubectl(['get', 'services', '-o', 'json'], namespace)

    if 'error' in services:
        return services

    results = {
        "total": 0,
        "with_endpoints": 0,
        "without_endpoints": 0,
        "load_balancers": 0,
        "load_balancers_pending": 0,
        "issues": []
    }

    for svc in services.get('items', []):
        name = svc['metadata']['name']
        svc_type = svc['spec'].get('type', 'ClusterIP')
        results["total"] += 1

        # Check endpoints
        endpoints = run_kubectl(['get', 'endpoints', name, '-o', 'json'], namespace)
        if 'error' not in endpoints:
            subsets = endpoints.get('subsets', [])
            if subsets and any(s.get('addresses', []) for s in subsets):
                results["with_endpoints"] += 1
            else:
                results["without_endpoints"] += 1
                results["issues"].append(f"Service {name}: No endpoints (no pods matching selector)")

        # Check LoadBalancer status
        if svc_type == 'LoadBalancer':
            results["load_balancers"] += 1
            lb_ingress = svc.get('status', {}).get('loadBalancer', {}).get('ingress', [])
            if not lb_ingress:
                results["load_balancers_pending"] += 1
                results["issues"].append(f"Service {name}: LoadBalancer stuck in Pending")

    return results
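
# Illustrative Endpoints object consulted above (shape of the API object):
#   {"subsets": [{"addresses": [{"ip": "10.0.0.5"}], "ports": [{"port": 8080}]}]}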


def check_deployments(namespace: str) -> Dict[str, Any]:
    """Check deployment health."""
    deployments = run_kubectl(['get', 'deployments', '-o', 'json'], namespace)

    if 'error' in deployments:
        return deployments

    results = {
        "total": 0,
        "available": 0,
        "unavailable": 0,
        "progressing": 0,
        "issues": []
    }

    for deploy in deployments.get('items', []):
        name = deploy['metadata']['name']
        results["total"] += 1

        status = deploy.get('status', {})
        replicas = status.get('replicas', 0)
        ready_replicas = status.get('readyReplicas', 0)
        available_replicas = status.get('availableReplicas', 0)

        # Note: a deployment scaled to zero is reported as unavailable here.
        if available_replicas == replicas and available_replicas > 0:
            results["available"] += 1
        elif available_replicas == 0:
            results["unavailable"] += 1
            results["issues"].append(f"Deployment {name}: No replicas available ({ready_replicas}/{replicas})")
        else:
            results["progressing"] += 1
            results["issues"].append(f"Deployment {name}: Partially available ({available_replicas}/{replicas})")

    return results


def check_pvcs(namespace: str) -> Dict[str, Any]:
    """Check PersistentVolumeClaims."""
    pvcs = run_kubectl(['get', 'pvc', '-o', 'json'], namespace)

    if 'error' in pvcs:
        return pvcs

    results = {
        "total": 0,
        "bound": 0,
        "pending": 0,
        "lost": 0,
        "issues": []
    }

    for pvc in pvcs.get('items', []):
        name = pvc['metadata']['name']
        phase = pvc.get('status', {}).get('phase', 'Unknown')
        results["total"] += 1

        if phase == 'Bound':
            results["bound"] += 1
        elif phase == 'Pending':
            results["pending"] += 1
            results["issues"].append(f"PVC {name}: Stuck in Pending state")
        elif phase == 'Lost':
            results["lost"] += 1
            results["issues"].append(f"PVC {name}: Volume lost")

    return results


def check_resource_quotas(namespace: str) -> Dict[str, Any]:
    """Check resource quotas and usage."""
    quotas = run_kubectl(['get', 'resourcequota', '-o', 'json'], namespace)

    if 'error' in quotas:
        # Return the full shape so downstream consumers can index
        # 'near_limit' and 'exceeded' without a KeyError.
        return {"total": 0, "near_limit": [], "exceeded": [], "issues": []}

    results = {
        "total": 0,
        "near_limit": [],
        "exceeded": [],
        "issues": []
    }

    for quota in quotas.get('items', []):
        name = quota['metadata']['name']
        results["total"] += 1

        status = quota.get('status', {})
        hard = status.get('hard', {})
        used = status.get('used', {})

        for resource, limit in hard.items():
            usage = used.get(resource, '0')

            # Parse values (handle different formats: CPU, memory, counts)
            try:
                if resource.endswith('memory'):
                    # Convert to bytes for comparison
                    limit_val = parse_memory(limit)
                    usage_val = parse_memory(usage)
                elif resource.endswith('cpu'):
                    # Convert to millicores
                    limit_val = parse_cpu(limit)
                    usage_val = parse_cpu(usage)
                else:
                    # Plain numbers
                    limit_val = int(limit)
                    usage_val = int(usage)

                if limit_val > 0:
                    usage_percent = (usage_val / limit_val) * 100

                    if usage_percent >= 100:
                        results["exceeded"].append(resource)
                        results["issues"].append(f"Quota {name}: {resource} exceeded ({usage}/{limit})")
                    elif usage_percent >= 80:
                        results["near_limit"].append(resource)
                        results["issues"].append(f"Quota {name}: {resource} near limit ({usage}/{limit}, {usage_percent:.0f}%)")

            except (ValueError, AttributeError):
                continue

    return results
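
# Illustrative ResourceQuota status this parses (shape of the API object):
#   {"hard": {"requests.cpu": "4", "requests.memory": "8Gi"},
#    "used": {"requests.cpu": "500m", "requests.memory": "512Mi"}}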


def parse_memory(value: str) -> int:
    """Parse a memory string to bytes."""
    units = {'Ki': 1024, 'Mi': 1024**2, 'Gi': 1024**3, 'Ti': 1024**4}
    for unit, multiplier in units.items():
        if value.endswith(unit):
            return int(value[:-2]) * multiplier
    return int(value)


def parse_cpu(value: str) -> int:
    """Parse a CPU string to millicores."""
    if value.endswith('m'):
        return int(value[:-1])
    return int(float(value) * 1000)
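
# Illustrative conversions (derived from the parsers above):
#   parse_memory('512Mi') == 512 * 1024**2
#   parse_cpu('250m') == 250
#   parse_cpu('2') == 2000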


def get_recent_events(namespace: str, limit: int = 10) -> List[Dict[str, Any]]:
    """Get recent events in the namespace."""
    events = run_kubectl(['get', 'events', '--sort-by=.lastTimestamp', '-o', 'json'], namespace)

    if 'error' in events:
        return []

    recent_events = []
    for event in events.get('items', [])[-limit:]:
        recent_events.append({
            "type": event.get('type', 'Unknown'),
            "reason": event.get('reason', ''),
            "message": event.get('message', ''),
            "object": f"{event.get('involvedObject', {}).get('kind', '')}/{event.get('involvedObject', {}).get('name', '')}",
            "count": event.get('count', 1),
            "last_timestamp": event.get('lastTimestamp', '')
        })

    return recent_events


def generate_recommendations(results: Dict[str, Any]) -> List[str]:
    """Generate actionable recommendations based on findings."""
    recommendations = []

    # Use .get() throughout: a check that failed returns an error dict
    # without the usual counter keys.
    pods = results.get('pods', {})
    services = results.get('services', {})
    deployments = results.get('deployments', {})
    pvcs = results.get('pvcs', {})
    quotas = results.get('quotas', {})

    # Pod recommendations
    if pods.get('pending', 0) > 0:
        recommendations.append("⚠️ Check pending pods with: kubectl describe pod <pod-name> -n <namespace>")
        recommendations.append("⚠️ Verify node resources: kubectl describe nodes")

    if pods.get('crashlooping', 0) > 0:
        recommendations.append("⚠️ Investigate crashlooping pods: kubectl logs <pod-name> -n <namespace> --previous")

    if pods.get('image_pull_errors', 0) > 0:
        recommendations.append("⚠️ Fix image pull errors: verify image name, check imagePullSecrets")

    # Service recommendations
    if services.get('without_endpoints', 0) > 0:
        recommendations.append("⚠️ Services without endpoints: check pod selectors match pod labels")

    if services.get('load_balancers_pending', 0) > 0:
        recommendations.append("⚠️ LoadBalancer stuck: check cloud provider controller logs")

    # Deployment recommendations
    if deployments.get('unavailable', 0) > 0:
        recommendations.append("⚠️ Unavailable deployments: check pod errors and resource availability")

    # PVC recommendations
    if pvcs.get('pending', 0) > 0:
        recommendations.append("⚠️ Pending PVCs: verify StorageClass exists and provisioner is working")

    # Quota recommendations
    if quotas.get('exceeded'):
        recommendations.append(f"🚨 Resource quotas exceeded: {', '.join(quotas['exceeded'])}")
        recommendations.append("🚨 Action required: increase quota or reduce resource requests")

    if quotas.get('near_limit'):
        recommendations.append(f"⚠️ Near quota limits: {', '.join(quotas['near_limit'])}")

    if not recommendations:
        recommendations.append("✅ No critical issues detected")

    return recommendations


def main():
    parser = argparse.ArgumentParser(
        description="Comprehensive health check for a Kubernetes namespace",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Check namespace with human-readable output
  %(prog)s my-namespace

  # Output as JSON
  %(prog)s my-namespace --json

  # Include more events
  %(prog)s my-namespace --events 20
"""
    )

    parser.add_argument(
        "namespace",
        help="Namespace to check"
    )

    parser.add_argument(
        "--json",
        action="store_true",
        help="Output results as JSON"
    )

    parser.add_argument(
        "--events",
        type=int,
        default=10,
        help="Number of recent events to include (default: 10)"
    )

    args = parser.parse_args()

    # Perform all checks
    results = {
        "namespace": args.namespace,
        # datetime.utcnow() is deprecated; build an explicit UTC timestamp
        "timestamp": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
        "pods": check_pods(args.namespace),
        "services": check_services(args.namespace),
        "deployments": check_deployments(args.namespace),
        "pvcs": check_pvcs(args.namespace),
        "quotas": check_resource_quotas(args.namespace),
        "recent_events": get_recent_events(args.namespace, args.events)
    }

    # Generate recommendations
    results["recommendations"] = generate_recommendations(results)

    # Determine overall health
    total_issues = (
        len(results["pods"].get("issues", [])) +
        len(results["services"].get("issues", [])) +
        len(results["deployments"].get("issues", [])) +
        len(results["pvcs"].get("issues", [])) +
        len(results["quotas"].get("issues", []))
    )

    results["health_status"] = "healthy" if total_issues == 0 else "degraded" if total_issues < 5 else "critical"

    if args.json:
        print(json.dumps(results, indent=2))
    else:
        # Human-readable output
        print(f"🔍 Namespace Health Check: {args.namespace}")
        print(f"⏰ Timestamp: {results['timestamp']}")
        print(f"📊 Overall Status: {results['health_status'].upper()}\n")

        # Pods
        print("📦 Pods:")
        print(f"  Total: {results['pods']['total']}")
        print(f"  Running: {results['pods']['running']}")
        print(f"  Pending: {results['pods']['pending']}")
        print(f"  Failed: {results['pods']['failed']}")
        if results['pods']['crashlooping'] > 0:
            print(f"  ⚠️ CrashLooping: {results['pods']['crashlooping']}")
        if results['pods']['image_pull_errors'] > 0:
            print(f"  ⚠️ ImagePull Errors: {results['pods']['image_pull_errors']}")
        print()

        # Services
        print("🌐 Services:")
        print(f"  Total: {results['services']['total']}")
        print(f"  With Endpoints: {results['services']['with_endpoints']}")
        if results['services']['without_endpoints'] > 0:
            print(f"  ⚠️ Without Endpoints: {results['services']['without_endpoints']}")
        if results['services']['load_balancers_pending'] > 0:
            print(f"  ⚠️ LB Pending: {results['services']['load_balancers_pending']}")
        print()

        # Deployments
        if results['deployments']['total'] > 0:
            print("🚀 Deployments:")
            print(f"  Total: {results['deployments']['total']}")
            print(f"  Available: {results['deployments']['available']}")
            if results['deployments']['unavailable'] > 0:
                print(f"  ⚠️ Unavailable: {results['deployments']['unavailable']}")
            print()

        # PVCs
        if results['pvcs']['total'] > 0:
            print("💾 PersistentVolumeClaims:")
            print(f"  Total: {results['pvcs']['total']}")
            print(f"  Bound: {results['pvcs']['bound']}")
            if results['pvcs']['pending'] > 0:
                print(f"  ⚠️ Pending: {results['pvcs']['pending']}")
            print()

        # Quotas
        if results['quotas']['total'] > 0:
            print("📏 Resource Quotas:")
            print(f"  Total: {results['quotas']['total']}")
            if results['quotas']['exceeded']:
                print(f"  🚨 Exceeded: {', '.join(results['quotas']['exceeded'])}")
            if results['quotas']['near_limit']:
                print(f"  ⚠️ Near Limit: {', '.join(results['quotas']['near_limit'])}")
            print()

        # Issues
        if total_issues > 0:
            print(f"⚠️ Issues ({total_issues}):")
            all_issues = (
                results["pods"].get("issues", []) +
                results["services"].get("issues", []) +
                results["deployments"].get("issues", []) +
                results["pvcs"].get("issues", []) +
                results["quotas"].get("issues", [])
            )
            for issue in all_issues[:10]:  # Show first 10
                print(f"  - {issue}")
            if len(all_issues) > 10:
                print(f"  ... and {len(all_issues) - 10} more (use --json for full list)")
            print()

        # Recommendations
        print("💡 Recommendations:")
        for rec in results["recommendations"]:
            print(f"  {rec}")

    sys.exit(0 if results["health_status"] in ["healthy", "degraded"] else 1)


if __name__ == "__main__":
    main()

skills/scripts/cluster_health.py · 223 lines · Executable file
@@ -0,0 +1,223 @@
#!/usr/bin/env python3
"""
Cluster Health Check Script
Performs comprehensive cluster health diagnostics.
"""
import json
import subprocess
from typing import Dict, List, Any
from datetime import datetime


def run_kubectl(args: List[str]) -> Dict[str, Any]:
    """Run a kubectl command and return its parsed JSON output."""
    try:
        result = subprocess.run(
            ['kubectl'] + args,
            capture_output=True,
            text=True,
            check=True
        )
        return json.loads(result.stdout) if result.stdout else {}
    except subprocess.CalledProcessError as e:
        return {"error": e.stderr}
    except json.JSONDecodeError:
        return {"error": "Failed to parse kubectl output"}


def check_nodes() -> Dict[str, Any]:
    """Check node health."""
    nodes = run_kubectl(['get', 'nodes', '-o', 'json'])
    if 'error' in nodes:
        return nodes

    results = {
        "healthy": 0,
        "unhealthy": 0,
        "issues": []
    }

    for node in nodes.get('items', []):
        name = node['metadata']['name']
        conditions = node.get('status', {}).get('conditions', [])

        # A node is healthy only if its Ready condition is explicitly True;
        # a missing Ready condition also counts as unhealthy.
        is_ready = any(c['type'] == 'Ready' and c['status'] == 'True'
                       for c in conditions)
        if is_ready:
            results['healthy'] += 1
        else:
            results['unhealthy'] += 1
            results['issues'].append(f"Node {name} is not Ready")

        # Any other condition that is True signals a problem
        # (e.g. MemoryPressure, DiskPressure, PIDPressure)
        for condition in conditions:
            if condition['type'] != 'Ready' and condition['status'] == 'True':
                results['issues'].append(f"Node {name}: {condition['type']} = {condition['status']}")

    return results
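
# Illustrative node condition entry (shape of the API object):
#   {"type": "Ready", "status": "True", "reason": "KubeletReady", ...}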


def check_system_pods() -> Dict[str, Any]:
    """Check critical system pods."""
    namespaces = ['kube-system', 'kube-public', 'kube-node-lease']
    results = {
        "healthy": 0,
        "unhealthy": 0,
        "issues": []
    }

    for ns in namespaces:
        pods = run_kubectl(['get', 'pods', '-n', ns, '-o', 'json'])
        if 'error' in pods:
            continue

        for pod in pods.get('items', []):
            name = pod['metadata']['name']
            phase = pod.get('status', {}).get('phase', 'Unknown')

            if phase == 'Running':
                # Check if all containers are ready
                container_statuses = pod.get('status', {}).get('containerStatuses', [])
                all_ready = all(c.get('ready', False) for c in container_statuses)

                if all_ready:
                    results['healthy'] += 1
                else:
                    results['unhealthy'] += 1
                    results['issues'].append(f"Pod {ns}/{name}: Containers not ready")
            elif phase == 'Succeeded':
                # 'Succeeded' is the API phase; kubectl displays it as 'Completed'
                results['healthy'] += 1
            else:
                results['unhealthy'] += 1
                results['issues'].append(f"Pod {ns}/{name}: Phase is {phase}")

    return results


def check_pending_pods() -> Dict[str, Any]:
    """Check for pods stuck in Pending."""
    all_pods = run_kubectl(['get', 'pods', '--all-namespaces', '-o', 'json'])
    if 'error' in all_pods:
        return all_pods

    pending = []
    for pod in all_pods.get('items', []):
        if pod.get('status', {}).get('phase') == 'Pending':
            name = pod['metadata']['name']
            namespace = pod['metadata']['namespace']
            pending.append(f"{namespace}/{name}")

    return {"count": len(pending), "pods": pending}


def check_failed_pods() -> Dict[str, Any]:
    """Check for failed pods."""
    all_pods = run_kubectl(['get', 'pods', '--all-namespaces', '-o', 'json'])
    if 'error' in all_pods:
        return all_pods

    failed = []
    for pod in all_pods.get('items', []):
        if pod.get('status', {}).get('phase') == 'Failed':
            name = pod['metadata']['name']
            namespace = pod['metadata']['namespace']
            failed.append(f"{namespace}/{name}")

    return {"count": len(failed), "pods": failed}


def check_crashloop_pods() -> Dict[str, Any]:
    """Check for pods in a crash loop."""
    all_pods = run_kubectl(['get', 'pods', '--all-namespaces', '-o', 'json'])
    if 'error' in all_pods:
        return all_pods

    crashloop = []
    for pod in all_pods.get('items', []):
        container_statuses = pod.get('status', {}).get('containerStatuses', [])
        for container in container_statuses:
            state = container.get('state', {})
            if 'waiting' in state and 'CrashLoopBackOff' in state['waiting'].get('reason', ''):
                name = pod['metadata']['name']
                namespace = pod['metadata']['namespace']
                container_name = container['name']
                crashloop.append(f"{namespace}/{name} (container: {container_name})")
                break

    return {"count": len(crashloop), "pods": crashloop}


def main():
    print("🏥 Kubernetes Cluster Health Check")
    print("=" * 60)
    print(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")

    # Check nodes
    print("🖥️ Node Health:")
    nodes = check_nodes()
    if 'error' not in nodes:
        print(f"  ✅ Healthy nodes: {nodes['healthy']}")
        if nodes['unhealthy'] > 0:
            print(f"  ❌ Unhealthy nodes: {nodes['unhealthy']}")
            for issue in nodes['issues']:
                print(f"    • {issue}")
    else:
        print(f"  ❌ Error: {nodes['error']}")
    print()

    # Check system pods
    print("🔧 System Pods:")
    system = check_system_pods()
    if 'error' not in system:
        print(f"  ✅ Healthy: {system['healthy']}")
        if system['unhealthy'] > 0:
            print(f"  ⚠️ Unhealthy: {system['unhealthy']}")
            for issue in system['issues'][:10]:  # Show first 10
                print(f"    • {issue}")
    else:
        print(f"  ❌ Error: {system['error']}")
    print()

    # Check pending pods
    print("⏳ Pending Pods:")
    pending = check_pending_pods()
    if 'error' not in pending:
        if pending['count'] == 0:
            print("  ✅ No pods stuck in pending")
        else:
            print(f"  ⚠️ {pending['count']} pods in pending state:")
            for pod in pending['pods'][:10]:
                print(f"    • {pod}")
    else:
        print(f"  ❌ Error: {pending['error']}")
    print()

    # Check failed pods
    print("💥 Failed Pods:")
    failed = check_failed_pods()
    if 'error' not in failed:
        if failed['count'] == 0:
            print("  ✅ No failed pods")
        else:
            print(f"  ❌ {failed['count']} pods in failed state:")
            for pod in failed['pods'][:10]:
                print(f"    • {pod}")
    else:
        print(f"  ❌ Error: {failed['error']}")
    print()

    # Check crash loops
    print("🔄 Crash Loop Pods:")
    crashloop = check_crashloop_pods()
    if 'error' not in crashloop:
        if crashloop['count'] == 0:
            print("  ✅ No pods in crash loop")
        else:
            print(f"  ❌ {crashloop['count']} pods in crash loop:")
            for pod in crashloop['pods'][:10]:
                print(f"    • {pod}")
    else:
        print(f"  ❌ Error: {crashloop['error']}")
    print()

    print("=" * 60)
    print("Health check complete!")


if __name__ == "__main__":
    main()

skills/scripts/diagnose_pod.py · 157 lines · Executable file
@@ -0,0 +1,157 @@
#!/usr/bin/env python3
"""
Comprehensive Pod Diagnostics Script
Analyzes a pod's health and returns structured diagnostic information.
"""
import json
import subprocess
import sys
from typing import Dict, List, Any


def run_kubectl(args: List[str]) -> Dict[str, Any]:
    """Run a kubectl command and return its parsed JSON output."""
    try:
        result = subprocess.run(
            ['kubectl'] + args,
            capture_output=True,
            text=True,
            check=True
        )
        return json.loads(result.stdout) if result.stdout else {}
    except subprocess.CalledProcessError as e:
        return {"error": e.stderr}
    except json.JSONDecodeError:
        return {"error": "Failed to parse kubectl output"}


def check_pod_status(namespace: str, pod: str) -> Dict[str, Any]:
    """Get pod status and basic info."""
    return run_kubectl(['get', 'pod', pod, '-n', namespace, '-o', 'json'])


def check_events(namespace: str, pod: str) -> Dict[str, Any]:
    """Get events related to the pod."""
    return run_kubectl(['get', 'events', '-n', namespace,
                        '--field-selector', f'involvedObject.name={pod}',
                        '-o', 'json', '--sort-by', '.lastTimestamp'])


def check_resource_usage(namespace: str, pod: str) -> Dict[str, Any]:
    """Get resource usage if the metrics server is available."""
    # `kubectl top` prints a plain-text table, not JSON, so call it
    # directly instead of going through run_kubectl.
    try:
        result = subprocess.run(['kubectl', 'top', 'pod', pod, '-n', namespace],
                                capture_output=True, text=True, check=True)
        return {"output": result.stdout}
    except subprocess.CalledProcessError as e:
        return {"error": e.stderr}


def analyze_pod(pod_data: Dict[str, Any]) -> Dict[str, Any]:
    """Analyze pod data and identify issues."""
    issues = []
    recommendations = []

    status = pod_data.get('status', {})
    spec = pod_data.get('spec', {})

    # Check phase
    phase = status.get('phase', 'Unknown')
    if phase not in ['Running', 'Succeeded']:
        issues.append(f"Pod is in {phase} phase")

    # Check container statuses
    container_statuses = status.get('containerStatuses', [])
    for container in container_statuses:
        name = container.get('name')
        ready = container.get('ready', False)

        if not ready:
            issues.append(f"Container {name} is not ready")

        state = container.get('state', {})
        if 'waiting' in state:
            reason = state['waiting'].get('reason', 'Unknown')
            message = state['waiting'].get('message', '')
            issues.append(f"Container {name} waiting: {reason} - {message}")

            if reason == 'ImagePullBackOff':
                recommendations.append("Check image name and registry credentials")
            elif reason == 'CrashLoopBackOff':
                recommendations.append(f"Check logs for container {name} to identify crash cause")

        if 'terminated' in state:
            reason = state['terminated'].get('reason', 'Unknown')
            exit_code = state['terminated'].get('exitCode', 0)
            issues.append(f"Container {name} terminated: {reason} (exit code {exit_code})")

        restart_count = container.get('restartCount', 0)
        if restart_count > 5:
            issues.append(f"Container {name} has restarted {restart_count} times")
            recommendations.append(f"Investigate crash loops in container {name}")

    # Check resource requests/limits
    for container in spec.get('containers', []):
        resources = container.get('resources', {})
        if not resources.get('requests'):
            recommendations.append(f"Consider setting resource requests for container {container.get('name')}")
        if not resources.get('limits'):
            recommendations.append(f"Consider setting resource limits for container {container.get('name')}")

    # Check restart policy
    restart_policy = spec.get('restartPolicy', 'Always')
    if restart_policy == 'Never' and issues:
        recommendations.append("Restart policy is 'Never' - pod won't restart automatically")

    return {
        "phase": phase,
        "issues": issues,
        "recommendations": recommendations
    }
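
# Illustrative call (not part of the original script): analyze_pod is pure,
# so it can be exercised on a minimal manifest dict:
#   analyze_pod({"status": {"phase": "Pending"}, "spec": {}})
#   -> {"phase": "Pending", "issues": ["Pod is in Pending phase"], "recommendations": []}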


def main():
    if len(sys.argv) != 3:
        print("Usage: diagnose_pod.py <namespace> <pod-name>")
        sys.exit(1)

    namespace = sys.argv[1]
    pod = sys.argv[2]

    print(f"🔍 Diagnosing pod: {pod} in namespace: {namespace}\n")

    # Get pod details
    pod_data = check_pod_status(namespace, pod)
    if 'error' in pod_data:
        print(f"❌ Error fetching pod: {pod_data['error']}")
        sys.exit(1)

    # Analyze pod
    analysis = analyze_pod(pod_data)

    print(f"📊 Pod Phase: {analysis['phase']}\n")

    if analysis['issues']:
        print("⚠️ Issues Found:")
        for issue in analysis['issues']:
            print(f"  • {issue}")
        print()
    else:
        print("✅ No issues detected\n")

    if analysis['recommendations']:
        print("💡 Recommendations:")
        for rec in analysis['recommendations']:
            print(f"  • {rec}")
        print()

    # Get events
    events_data = check_events(namespace, pod)
    if 'items' in events_data and events_data['items']:
        print("📋 Recent Events:")
        for event in events_data['items'][-5:]:  # Last 5 events
            msg = event.get('message', '')
            reason = event.get('reason', '')
            print(f"  • {reason}: {msg}")
        print()

    # Try to get resource usage
    print("📈 Resource Usage:")
    resource_data = check_resource_usage(namespace, pod)
    if 'output' in resource_data:
        for line in resource_data['output'].splitlines():
            print(f"  {line}")
    else:
        print("  Metrics server not available")


if __name__ == "__main__":
    main()